In [58]:
sc.stop()

In [59]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [60]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "km") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [61]:
from pyspark.sql import Window
import pyspark.sql.functions as f

In [62]:
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, \
TimestampType, DateType, IntegerType, FloatType

In [63]:
from pyspark.ml.feature import OneHotEncoder

In [64]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


### Views

### 
schema_views = StructType(fields=[
    StructField("user_id", StringType()),
    StructField("item_id", StringType()),
    StructField("ts_start", IntegerType()),
    StructField("ts_end", IntegerType()),
    StructField("item_type", StringType())
])

###
views = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", header=True, schema=schema_views)
views.registerTempTable('views')
views.show(5)

### 
views = views.withColumn('diff', f.col('ts_end') - f.col('ts_start'))
views = views.withColumn('diff_sum_user_id', f.sum("diff").over(Window.partitionBy("user_id")).cast('decimal(10,0)'))
views = views.withColumn('diff_sum_item_id', f.sum("diff").over(Window.partitionBy("item_id")).cast('decimal(10,0)'))
views.show(4)

### Train

In [65]:
schema_train = StructType(fields=[
    StructField("user_id", StringType()),
    StructField("item_id", StringType()),
    StructField("purchase", IntegerType())
])

In [67]:
train = spark.read.csv("/labs/slaba03/laba03_train.csv", header=True, schema=schema_train)
train.registerTempTable('train')
train.cache()
train.show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
|   1654|  66187|       0|
|   1654|  84350|       0|
|   1654|  92854|       0|
|   1654|  72811|       0|
|   1654|  86876|       0|
|   1654| 102657|       0|
|   1654| 100482|       0|
|   1654|  89677|       0|
|   1654|  99419|       0|
|   1654|  66603|       0|
|   1654|   7363|       0|
|   1654|   1320|       0|
|   1654|  88892|       0|
|   1654|  66671|       0|
|   1654|  75925|       0|
+-------+-------+--------+
only showing top 20 rows



In [74]:
train.count()

5032624

In [69]:
train.select('user_id').distinct().count()

1941

In [73]:
train.select('item_id').distinct().count()

3704

In [75]:
1941 * 3704

7189464

In [76]:
7189464 - 5032624

2156840

### 
items = spark.read.csv("/labs/slaba03/laba03_items.csv", header=True, sep="\t")
items.registerTempTable('items')
items.show()

### 
train_ = train.join(items, on='item_id', how='left')

###
train_ = train.join(views.select('user_id', 'item_id', 'diff', 'diff_sum_user_id', 'diff_sum_item_id'),\
                    on=['user_id', 'item_id'], how='left')
train_.show(4)

In [16]:
train_.groupBy('purchase').count().show()

+--------+-------+
|purchase|  count|
+--------+-------+
|       0|5021720|
|       1|  10904|
+--------+-------+



In [17]:
print('Доля класса 1: {}'.format(10904 / (10904 + 5021720)))

Доля класса 1: 0.0021666629575346776


In [9]:
sum_ = train.groupBy("user_id").sum().select(f.col("sum(purchase)").alias("sum"), f.col("user_id")).cache()

hit = train.groupBy("item_id").sum().select(f.col("sum(purchase)").alias("hit"), f.col("item_id")).cache()

In [77]:
train = train.withColumn('sum', f.sum('purchase').over(Window.partitionBy('user_id')))
train = train.withColumn('hit', f.sum('purchase').over(Window.partitionBy('item_id')))
train = train.withColumn('sum2', f.col('sum') / f.count('purchase').over(Window.partitionBy('user_id')))
train.cache()

DataFrame[user_id: string, item_id: string, purchase: int, sum: bigint, hit: bigint, sum2: double]

In [78]:
train.show(4)

+-------+-------+--------+---+---+--------------------+
|user_id|item_id|purchase|sum|hit|                sum2|
+-------+-------+--------+---+---+--------------------+
| 867363| 100140|       0|  1|  0|3.892565200467107...|
| 867363|   2136|       0|  1|  1|3.892565200467107...|
| 867363|  60351|       0|  1|  2|3.892565200467107...|
| 867363|    691|       0|  1|  0|3.892565200467107...|
+-------+-------+--------+---+---+--------------------+
only showing top 4 rows



###
train_ = train_.withColumn('sum_purch_by_user', f.sum("purchase").over(Window.partitionBy("user_id")).cast('decimal(10,0)'))
train_ = train_.withColumn('sum_purch_by_item', f.sum("purchase").over(Window.partitionBy("item_id")).cast('decimal(10,0)'))
#train_ = train_.withColumn('genr_len', f.coalesce(f.length(f.col('genres')), f.lit(0)).cast('decimal(10,0)'))
train_ = train_.join(user_attempts, on = 'user_id')
train_ = train_.join(item_attempts, on = 'item_id')

train_.show(5)

In [70]:
test = spark.read.csv("/labs/slaba03/laba03_test.csv", header=True)
test.registerTempTable('test')
test.cache()
test.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [71]:
test.select('user_id').distinct().count()

1941

In [72]:
test.select('item_id').distinct().count()

3704

In [13]:
test.count()

2156840

### join "hit"

In [79]:
test_2 = test.join(train.select('item_id', 'hit').distinct(), on="item_id", how='left')
test_2.cache()
test_2.count()

2156840

In [80]:
test_2.show(4)

+-------+-------+--------+---+
|item_id|user_id|purchase|hit|
+-------+-------+--------+---+
|  94814|   1654|    null|  1|
|  93629|   1654|    null|  4|
|   9980|   1654|    null|  1|
|  95099|   1654|    null|  1|
+-------+-------+--------+---+
only showing top 4 rows



### join "sum"

In [82]:
test_3 = test_2.join(train.select('user_id', 'sum').distinct(), on="user_id", how='left')
test_3.cache()
test_3.count()

2156840

In [83]:
test_3.show(5)

+-------+-------+--------+---+---+
|user_id|item_id|purchase|hit|sum|
+-------+-------+--------+---+---+
| 867363|  92649|    null|  1|  1|
| 867363|  84350|    null|  3|  1|
| 867363|  88833|    null|  1|  1|
| 867363|  74396|    null|  1|  1|
| 867363|  98560|    null|  1|  1|
+-------+-------+--------+---+---+
only showing top 5 rows



### join "sum_2" 

In [51]:
test_3.join(train.select('user_id', 'sum2'), on="user_id", how='left')

DataFrame[user_id: string, item_id: string, purchase: string, hit: bigint, sum: bigint, sum2: double]

In [84]:
%%time
test_4 = test_3.join(train.select('user_id', 'sum2').distinct(), on="user_id", how='left')
test_4.cache()
test_4.count()

CPU times: user 7.07 ms, sys: 0 ns, total: 7.07 ms
Wall time: 13.4 s


In [85]:
test_4.count()

2156840

In [86]:
test_4.show(4)

+-------+-------+--------+---+---+--------------------+
|user_id|item_id|purchase|hit|sum|                sum2|
+-------+-------+--------+---+---+--------------------+
| 867363|  92649|    null|  1|  1|3.892565200467107...|
| 867363|  84350|    null|  3|  1|3.892565200467107...|
| 867363|  88833|    null|  1|  1|3.892565200467107...|
| 867363|  74396|    null|  1|  1|3.892565200467107...|
+-------+-------+--------+---+---+--------------------+
only showing top 4 rows



In [88]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

In [108]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

In [90]:
# assembler1 = VectorAssembler(inputCols=['sum_purch_by_user', 'sum_purch_by_item', 'user_attempts', 'item_attempts', \
#                                        'user_addict', 'item_addict'], outputCol="features")
assembler2 = VectorAssembler(inputCols=['sum', 'hit', 'sum2'], outputCol="features")
train_model = assembler2.transform(train).cache()

In [91]:
train_model.show(4)

+-------+-------+--------+---+---+--------------------+--------------------+
|user_id|item_id|purchase|sum|hit|                sum2|            features|
+-------+-------+--------+---+---+--------------------+--------------------+
| 867363| 100140|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|
| 867363|   2136|       0|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|
| 867363|  60351|       0|  1|  2|3.892565200467107...|[1.0,2.0,3.892565...|
| 867363|    691|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|
+-------+-------+--------+---+---+--------------------+--------------------+
only showing top 4 rows



In [139]:
lr = LogisticRegression(featuresCol="features",  labelCol="purchase", maxIter=50, regParam=0.2)

In [109]:
rf_ = RandomForestClassifier(featuresCol="features",  labelCol="purchase", numTrees=41)

In [140]:
%%time
model_lr = lr.fit(train_model)

CPU times: user 38.3 ms, sys: 1.44 ms, total: 39.7 ms
Wall time: 1min 5s


In [110]:
model_rf = rf_.fit(train_model)

In [94]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prob", labelCol="purchase", metricName='areaUnderROC')

In [95]:
def prob(col):
    return float(col[1])

In [96]:
new_col = f.udf(prob, FloatType())

In [97]:
train_model.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- sum: long (nullable = true)
 |-- hit: long (nullable = true)
 |-- sum2: double (nullable = true)
 |-- features: vector (nullable = true)



In [141]:
_predictions = model_lr.transform(train_model)
_predictions = _predictions.withColumn('prob', new_col(f.col('probability')).cast('double'))
_predictions.show(4)

+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
|user_id|item_id|purchase|sum|hit|                sum2|            features|       rawPrediction|         probability|prediction|                prob|
+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
| 867363| 100140|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|[6.16274052479,-6...|[0.99789795538515...|       0.0|0.002102044643834...|
| 867363|   2136|       0|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|0.002109672175720334|
| 867363|  60351|       0|  1|  2|3.892565200467107...|[1.0,2.0,3.892565...|[6.15548102919272...|[0.99788267250560...|       0.0|0.002117327414453...|
| 867363|    691|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|[6.16274052479,-6.

In [142]:
evaluator.evaluate(_predictions)

0.9221262427632121

In [99]:
evaluator.evaluate(_predictions)

0.9152577731771923

In [111]:
_predictions = model_rf.transform(train_model)
_predictions = _predictions.withColumn('prob', new_col(f.col('probability')).cast('double'))
_predictions.show(4)

+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
|user_id|item_id|purchase|sum|hit|                sum2|            features|       rawPrediction|         probability|prediction|                prob|
+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
| 867363| 100140|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|[40.9114447947498...|[0.99784011694511...|       0.0|0.002159883035346...|
| 867363|   2136|       0|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[40.9114447947498...|[0.99784011694511...|       0.0|0.002159883035346...|
| 867363|  60351|       0|  1|  2|3.892565200467107...|[1.0,2.0,3.892565...|[40.9114447947498...|[0.99784011694511...|       0.0|0.002159883035346...|
| 867363|    691|       0|  1|  0|3.892565200467107...|[1.0,0.0,3.892565...|[40.9114447947498.

In [112]:
evaluator.evaluate(_predictions)

0.5

In [100]:
test_4_model = assembler2.transform(test_4)
test_4_model.cache()
test_4_model.count()

2156840

In [143]:
predictions_test = model_lr.transform(test_4_model)

In [144]:
predictions_test.show(3)

+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|purchase|hit|sum|                sum2|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+
| 867363|  92649|    null|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|
| 867363|  84350|    null|  3|  1|3.892565200467107...|[1.0,3.0,3.892565...|[6.15185128139409...|[0.99787498953708...|       0.0|
| 867363|  88833|    null|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|
+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [145]:
predictions_test = predictions_test.withColumn('prob', new_col(f.col('probability')))
predictions_test.show(4)

+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+------------+
|user_id|item_id|purchase|hit|sum|                sum2|            features|       rawPrediction|         probability|prediction|        prob|
+-------+-------+--------+---+---+--------------------+--------------------+--------------------+--------------------+----------+------------+
| 867363|  92649|    null|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|0.0021096722|
| 867363|  84350|    null|  3|  1|3.892565200467107...|[1.0,3.0,3.892565...|[6.15185128139409...|[0.99787498953708...|       0.0|0.0021250104|
| 867363|  88833|    null|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|0.0021096722|
| 867363|  74396|    null|  1|  1|3.892565200467107...|[1.0,1.0,3.892565...|[6.15911077699136...|[0.99789032775509...|       0.0|0.0021096722|

In [146]:
test_out = predictions_test.select("user_id","item_id", f.col("prob").alias("purchase"))

In [147]:
test_out.show(2)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
| 867363|  92649|0.0021096722|
| 867363|  84350|0.0021250104|
+-------+-------+------------+
only showing top 2 rows



In [148]:
test_out.count()

2156840

In [149]:
test_out.orderBy('user_id','item_id').toPandas().to_csv('lab03.csv')