In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 8 --executor-memory 4g --executor-cores 2 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "KAM_lab3") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
spark

In [4]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression


from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType

from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, desc, lower, pandas_udf, row_number, explode, split
from pyspark.sql.functions import array, collect_set, lit

from pyspark.mllib.linalg import SparseVector, DenseVector



import json
import re

# train.csv

In [8]:
schema = StructType() \
      .add("user_id", IntegerType(), True) \
      .add("item_id", IntegerType(), True) \
      .add("purchase", IntegerType(), True)
      
df_user = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/labs/slaba03/laba03_train.csv")

df_user.createOrReplaceTempView("t_train")

In [9]:
spark.sql("""SELECT 999999 as user_id
                , 999999 as item_id
                , 1 as purchase
            UNION ALL 
            SELECT user_id
                , item_id
                , purchase
            FROM t_train
            """).createOrReplaceTempView("train")

In [10]:
df_user_buy = spark.sql(f"""SELECT user_id
            , sum(purchase)/count(1) buy
        FROM train
        GROUP BY user_id""")
df_user_buy.createOrReplaceTempView("user_buy")
df_user_buy.limit(5).toPandas()

Unnamed: 0,user_id,buy
0,851486,0.0
1,901457,0.0
2,927211,0.000392
3,928140,0.000387
4,825061,0.001931


In [11]:
df_item_buy = spark.sql(f"""SELECT item_id
            , sum(purchase)/count(1) buy
        FROM train
        GROUP BY item_id""")
df_item_buy.createOrReplaceTempView("item_buy")
df_item_buy.limit(5).toPandas()

Unnamed: 0,item_id,buy
0,99817,0.0
1,93486,0.002141
2,8389,0.005979
3,72820,0.000739
4,95080,0.0


In [12]:
spark.sql(f"""SELECT avg(buy)
        FROM item_buy""").toPandas()

Unnamed: 0,avg(buy)
0,0.002435


In [13]:
train_features = spark.sql(f"""SELECT tr.user_id
            , tr.item_id
            , coalesce(item_buy.buy, 0.001) as item_buy
            , coalesce(user_buy.buy, 0.001) as user_buy
            , purchase
        FROM train tr
        LEFT JOIN item_buy
            ON tr.item_id = item_buy.item_id
        LEFT JOIN user_buy
            ON tr.user_id = user_buy.user_id        
        """)

In [14]:
read_items_schema = StructType(fields=[
    StructField('item_id', IntegerType()), 
    StructField('channel_id', FloatType()),
    StructField('datetime_availability_start', StringType()),
    StructField('datetime_availability_stop', StringType()),
    StructField('datetime_show_start', StringType()),
    StructField('datetime_show_stop', StringType()),
    StructField('content_type', IntegerType()),
    StructField('title', StringType(), nullable=True),
    StructField('year', FloatType(), nullable=True),
    StructField('genres', StringType()),
    StructField('region_id', IntegerType()),
]) 

df_items = (spark.read.format("csv")
            .option("header", True)
            .option("sep", "\t")
            .schema(read_items_schema)
            .load("/labs/slaba03/laba03_items.csv")
           )
            
    
df_items = (df_items
            .withColumn("year", 
                        when(df_items.item_id == 103377, 2008.0)
                        .when(df_items.item_id == 95141, 2014.0)
                        .when(df_items.item_id == 72544, 2009.0)
                        .when(df_items.item_id == 8544, 1994.0)
                        .otherwise(df_items.year))
            .withColumn("genres", 
                        when(df_items.item_id == 103377, 'Анимация, Короткометражные')
                        .otherwise(df_items.genres))
           )
    
df_items = (df_items
            .filter(~df_items.item_id.isNull())
            .repartition(32)
            .cache())
df_items.createOrReplaceTempView("t_item")

In [15]:
spark.sql("""SELECT item_id, genres
                FROM t_item
            UNION ALL
            SELECT 999999 as item_id, 'нет' as genres
            """).createOrReplaceTempView("item")

In [16]:
df_items = spark.sql(f"""SELECT item_id
            , regexp_replace(coalesce(genres, 'пусто'), "[^\d\w a-zA-Zа-яА-Я]", " ") genres_prep
        FROM item
        """)

In [17]:
from pyspark.ml.feature import Tokenizer, HashingTF
tokenizer = Tokenizer(inputCol="genres_prep", outputCol="genres_words")
df_items = tokenizer.transform(df_items)

In [18]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
stop_words = (StopWordsRemover.loadDefaultStopWords("russian") +
            StopWordsRemover.loadDefaultStopWords("english") + [' ', ''])
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="genres_filtered", stopWords=stop_words)

In [19]:
df_items = swr.transform(df_items)

In [20]:
df_items_genres = df_items.select(df_items.item_id, explode(df_items.genres_words))

In [21]:
df_items_genres.createOrReplaceTempView("items_genres")

In [22]:
mylist = [
    {"general":"general","general_norm":"general"}
    , {"general":"песни","general_norm":"музыка"}
    , {"general":"мюзиклы","general_norm":"музыка"}
    , {"general":"мультсериалы","general_norm":"мультики"}
    , {"general":"кулинария","general_norm":"кулинария"}
    , {"general":"боевики","general_norm":"боевик"}
    , {"general":"семьи","general_norm":"семейные"}
    , {"general":"семейные","general_norm":"семейные"}
    , {"general":"семейный","general_norm":"семейные"}
    , {"general":"аниме","general_norm":"фэнтези"}
    , {"general":"фэнтези","general_norm":"фэнтези"}
    , {"general":"драма","general_norm":"драма"}
    , {"general":"драмы","general_norm":"драма"}
    , {"general":"арт","general_norm":"арт"}
    , {"general":"артхаус","general_norm":"арт"}
    , {"general":"хаус","general_norm":"арт"}
    , {"general":"здоровье","general_norm":"передачи"}
    , {"general":"реалити","general_norm":"передачи"}
    , {"general":"полнометражные","general_norm":"полнометражные"}
    , {"general":"вестерн","general_norm":"вестерн"}
    , {"general":"романтические","general_norm":"роман"}
    , {"general":"музыкальные","general_norm":"музыка"}
    , {"general":"спектакли","general_norm":"спектакли"}
    , {"general":"экранизации","general_norm":"экранизации"}
    , {"general":"боевик","general_norm":"боевик"}
    , {"general":"фильмы","general_norm":"полнометражные"}
    , {"general":"приключение","general_norm":"приключение"}
    , {"general":"документальный","general_norm":"документальный"}
    , {"general":"документальные","general_norm":"документальный"}
    , {"general":"исторический","general_norm":"документальный"}
    , {"general":"научная","general_norm":"документальный"}
    , {"general":"видеоигры","general_norm":"игры"}
    , {"general":"игры","general_norm":"игры"}
    , {"general":"игры","general_norm":"игры"}
    , {"general":"мелодрамы","general_norm":"мелодрамы"}
    , {"general":"мелодрама","general_norm":"мелодрамы"}
    , {"general":"биография","general_norm":"документальный"}
    , {"general":"детские","general_norm":"мультики"}
    , {"general":"маленьких","general_norm":"мультики"}
    , {"general":"союзмультфильм","general_norm":"мультики"}
    , {"general":"мультфильмы","general_norm":"мультики"}
    , {"general":"анимация","general_norm":"мультики"}
    , {"general":"мультфильм","general_norm":"мультики"}
    , {"general":"приключения","general_norm":"приключение"}
    , {"general":"военный","general_norm":"военные"}
    , {"general":"военные","general_norm":"военные"}
    , {"general":"детективы","general_norm":"детективы"}
    , {"general":"развлекательные","general_norm":"передачи"}
    , {"general":"передачи","general_norm":"передачи"}
    , {"general":"познавательные","general_norm":"передачи"}
    , {"general":"знать","general_norm":"передачи"}
    , {"general":"эротика","general_norm":"эротика"}
    , {"general":"мистические","general_norm":"фэнтези"}
    , {"general":"юмористические","general_norm":"комедии"}
    , {"general":"охота","general_norm":"передачи"}
    , {"general":"фантастика","general_norm":"фантастика"}
    , {"general":"прочие","general_norm":"прочие"}
    , {"general":"детей","general_norm":"мультики"}
    , {"general":"музыкальный","general_norm":"музыка"}
    , {"general":"рыбалка","general_norm":"передачи"}
    , {"general":"сказка","general_norm":"мультики"}
    , {"general":"исторические","general_norm":"передачи"}
    , {"general":"спортивные","general_norm":"передачи"}
    , {"general":"пусто","general_norm":"прочие"}
    , {"general":"сказки","general_norm":"мультики"}
    , {"general":"фантастические","general_norm":"фантастика"}
    , {"general":"триллер","general_norm":"триллер"}
    , {"general":"ужасы","general_norm":"ужасы"}
    , {"general":"шоу","general_norm":"передачи"}
    , {"general":"развивающие","general_norm":"передачи"}
    , {"general":"спорт","general_norm":"спорт"}
    , {"general":"комедия","general_norm":"комедии"}
    , {"general":"криминал","general_norm":"боевик"}
    , {"general":"комедии","general_norm":"комедии"}
    , {"general":"животных","general_norm":"передачи"}
    , {"general":"триллеры","general_norm":"триллер"}
    , {"general":"взрослых","general_norm":"эротика"}
    , {"general":"западные","general_norm":"зарубежные"}
    , {"general":"русские","general_norm":"русские"}
    , {"general":"зарубежные","general_norm":"зарубежные"}
    , {"general":"сериалы","general_norm":"короткометражные"}
    , {"general":"короткометражные","general_norm":"короткометражные"}
    , {"general":"короткометражки","general_norm":"короткометражные"}
    , {"general":"наши","general_norm":"русские"}
    , {"general":"советские","general_norm":"русские"}
    , {"general":"кино","general_norm":"полнометражные"}
    , {"general":"советское","general_norm":"русские"}
    , {"general":"нет","general_norm":"нет"}

]
genres_schema = StructType(fields=[
        StructField('general', StringType())
        , StructField('general_norm', StringType())])
#         , StructField('general_seg', StringType())
#         , StructField('general_reg', StringType())
#         , StructField('general_long', StringType())])
df = spark.createDataFrame(mylist, schema=genres_schema)
df.createOrReplaceTempView("genres")
spark.sql(f"""SELECT distinct col
        FROM items_genres 
        left join genres
            on items_genres.col = genres.general
        where genres.general is null
            and col not in ('для', 'про', 'в', 'о', 'вс', 'и', 'всей', 'd', 'самых', 'хочу')
            and length(col) > 1
        --GROUP BY col
        limit 100
        """).toPandas()

Unnamed: 0,col


In [24]:
spark.sql(f"""SELECT distinct general_norm
    from genres
    order by general_norm""").toPandas()

Unnamed: 0,general_norm
0,general
1,арт
2,боевик
3,вестерн
4,военные
5,детективы
6,документальный
7,драма
8,зарубежные
9,игры


In [23]:
items_genres = spark.sql(f"""SELECT DISTINCT item_id, general_norm
        FROM items_genres 
        join genres
            on items_genres.col = genres.general
        where col not in ('для', 'про', 'в', 'о', 'вс', 'и', 'всей', 'd', 'самых', 'хочу')
            and length(col) > 1
        """)

In [24]:
items_genres.createOrReplaceTempView("items_genres_for_user")

In [25]:
items_genres_vect = items_genres.groupBy('item_id').agg(collect_set('general_norm').alias('genres'))

In [26]:
items_genres_vect.show(5)

+-------+--------------------+
|item_id|              genres|
+-------+--------------------+
|   8389| [русские, мультики]|
|   8638|[зарубежные, трил...|
|  10817|[зарубежные, доку...|
|  72820|[драма, зарубежны...|
|  74757|[зарубежные, фэнт...|
+-------+--------------------+
only showing top 5 rows



In [27]:
count_vectorizer = CountVectorizer(inputCol='genres', outputCol="genres_vector", binary=False)
count_vectorizer_model = count_vectorizer.fit(items_genres_vect)
items_genres_vect = count_vectorizer_model.transform(items_genres_vect)

normalizer = Normalizer(inputCol='genres_vector', outputCol="genres_norm")
items_genres_vect = normalizer.transform(items_genres_vect)
items_genres_vect = items_genres_vect.drop("genres", "genres_vector")
items_genres_vect.createOrReplaceTempView("items_gender_vect")

In [28]:
items_genres_vect.createOrReplaceTempView("items_gender_vect")
items_genres_vect.show(5)

+-------+--------------------+
|item_id|         genres_norm|
+-------+--------------------+
|   8389|(32,[2,6],[0.7071...|
|   8638|(32,[1,4,7,10,17]...|
|  10817|(32,[1,19],[0.707...|
|  72820|(32,[1,3,8],[0.57...|
|  74757|(32,[1,10,18],[0....|
+-------+--------------------+
only showing top 5 rows



In [29]:
user_gender = spark.sql(f"""SELECT user_id, general_norm
        FROM train tr
        JOIN items_genres_for_user g
            ON tr.item_id = g.item_id
        WHERE purchase = 1""")

In [30]:
user_gender_vect = user_gender.groupBy('user_id').agg(collect_set('general_norm').alias('genres'))

In [31]:
user_gender_vect = count_vectorizer_model.transform(user_gender_vect)
user_gender_vect = normalizer.transform(user_gender_vect)
user_gender_vect = user_gender_vect.drop("genres", "genres_vector")
user_gender_vect.createOrReplaceTempView("user_gender_vect")

In [32]:
user_gender_vect.show(5)

+-------+--------------------+
|user_id|         genres_norm|
+-------+--------------------+
| 754230|(32,[1,2,3,4,5,6,...|
| 833685|(32,[1,2,4,6,9,14...|
| 879401|(32,[1,2,3,4,5,6,...|
| 776188|(32,[1,3,4,5,7,8,...|
| 825061|(32,[1,2,3,4,5,7,...|
+-------+--------------------+
only showing top 5 rows



In [33]:
user_vector_fin = spark.sql(f"""SELECT user_buy.user_id, buy, genres_norm
        FROM user_buy            
        LEFT JOIN user_gender_vect v
            ON user_buy.user_id = v.user_id
        """)
user_vector_fin.createOrReplaceTempView("user_vector_fin")

In [34]:
items_vector_fin = spark.sql(f"""SELECT item_buy.item_id, buy, genres_norm
        FROM item_buy            
        LEFT JOIN items_gender_vect v
            ON item_buy.item_id = v.item_id
        """)
items_vector_fin.createOrReplaceTempView("items_vector_fin")

In [35]:
train_features = spark.sql(f"""SELECT coalesce(items_vector_fin.buy, 0.001) as item_buy
            , coalesce(user_vector_fin.buy, 0.001) as user_buy
            , items_vector_fin.genres_norm as item_vect
            , user_vector_fin.genres_norm as user_vect
            , purchase
        FROM train tr
        JOIN user_vector_fin
            ON tr.user_id = user_vector_fin.user_id
        JOIN items_vector_fin
            ON tr.item_id = items_vector_fin.item_id
        WHERE items_vector_fin.genres_norm is Not Null
            AND user_vector_fin.genres_norm is Not Null
        """)

In [38]:
spark.sql("""SELECT purchase, count(1)
            FROM train
            GROUP BY purchase""").toPandas()

Unnamed: 0,purchase,count(1)
0,1,10905
1,0,5021720


In [39]:
train_features.limit(5).toPandas()

Unnamed: 0,item_buy,user_buy,item_vect,user_vect,purchase
0,0.005979,0.027576,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ...","(0.0, 0.23570226039551587, 0.23570226039551587...",0
1,0.005979,0.000776,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ...","(0.0, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.5, 0.0, ...",0
2,0.005979,0.000384,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026...",0
3,0.005979,0.001931,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ...","(0.0, 0.2886751345948129, 0.2886751345948129, ...",0
4,0.005979,0.007501,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ...","(0.0, 0.4082482904638631, 0.4082482904638631, ...",0


In [36]:
assembler = VectorAssembler(inputCols=["item_buy", "user_buy", "item_vect", "user_vect"], outputCol="features")
train_features = assembler.transform(train_features)


In [37]:
train_features = train_features.drop("item_buy", "user_buy", "item_vect", "user_vect")
train_features.limit(5).toPandas()

Unnamed: 0,purchase,features
0,0,"(0.005979073243647235, 0.0019402405898331393, ..."
1,0,"(0.005979073243647235, 0.004243827160493827, 0..."
2,0,"(0.005979073243647235, 0.00038880248833592535,..."
3,0,"(0.005979073243647235, 0.009220130618517095, 0..."
4,0,"(0.005979073243647235, 0.003436426116838488, 0..."


In [38]:
trainDF = train_features.sampleBy("purchase", fractions={0: 0.005, 1: 0.8}, seed=5757).coalesce(10).cache()
testDf = train_features.join(trainDF\
        , (train_features.features == trainDF.features)\
        , how="leftanti").coalesce(10).cache()

In [53]:
# 0.8346456692913385 [720, 0.06649297309185655]
import random
lgp = [0.0, 0.0, 0.0]
maxIter = 0
regParam = 0
lgp_save = [[maxIter, regParam], [maxIter, regParam], [maxIter, regParam]]
lMod = [0, 0, 0]
for i in range(100):
    maxIter, regParam = int(random.uniform(1, 1000)), random.uniform(0, 0.1)
    lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=maxIter, regParam=regParam)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr
                , sum(case when (purchase = 0 and purchase = prediction) then 1 else 0 end) / (count(1) - sum(purchase)) as pr0
                , sum(case when (purchase = 1 and purchase = prediction) then 1 else 0 end) / sum(purchase) as pr1                                    
        FROM predictions
        """).toPandas()
    res = [df.iloc[0,0], df.iloc[0,0], df.iloc[0,0]]
    for i in range(3):
        if lgp[i] < res[i]:
            lgp[i] = res[i]
            lgp_save[i] = [maxIter, regParam]
            lMod[i] = model
    print(df.iloc[0,0], df.iloc[0,1], df.iloc[0,2], maxIter, regParam)
print(df.iloc[0,0], df.iloc[0,1], df.iloc[0,2], [maxIter, regParam])

0.9463523099778809 0.9465536547897528 0.5412371134020618 47 0.07831650606993103
0.9426927247324273 0.9428803728618713 0.5651358950328023 714 0.05525195667336894
0.942154529891928 0.9423393486469038 0.57029053420806 404 0.05190801262257007
0.9351221483471692 0.9352808807678584 0.6157450796626054 388 0.012304952739303954
0.9464486822373821 0.9466500749468758 0.5412371134020618 300 0.07886660363914205
0.9461732996697737 0.9463743226134613 0.5417057169634489 446 0.07706009096506941
0.9384358177046549 0.9386069103907392 0.5941893158388004 680 0.03091689729438936
0.939305728655756 0.9394800484802414 0.5885660731021556 586 0.03568653143911351
0.9394710047529681 0.9396456396196481 0.5880974695407685 35 0.0365332495256837
0.9395303644780232 0.9397052617457917 0.5876288659793815 256 0.036873964101161054
0.9442865915459647 0.9444803885126795 0.5543580131208997 343 0.06517733881790237
0.9443527019848496 0.9445467647078003 0.5538894095595126 664 0.06560234078921448
0.9441026927899118 0.944295932559

In [86]:
from pyspark.ml.classification import RandomForestClassifier
lgp = 0.0
maxIter = 0
maxDepth = 5, 
maxBins = 32
lgp_save = [maxIter, regParam]
lMod = None
for i in range(5):
    numTrees, maxDepth, maxBins = int(random.uniform(10, 200)), int(random.uniform(5, 30)), int(random.uniform(32, 64))
    lr = RandomForestClassifier(featuresCol="features", labelCol="purchase", numTrees=numTrees, maxDepth=maxDepth, maxBins=maxBins)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
        FROM predictions
        --WHERE purchase = 1
        """).toPandas()
    res = df.iloc[0,0]
    if lgp < res:
        lgp = res
        lgp_save = [numTrees, maxDepth, maxBins]
        lMod_RF = model
    print(df.iloc[0,0], numTrees, maxDepth, maxBins)
print(lgp, [numTrees, maxDepth, maxBins])

0.8719941193785606 129 14 59
0.8563502119427658 11 11 63
0.8756268945793464 89 22 41
0.8776197564053544 159 26 53
0.8705181741819135 180 14 50
0.8776197564053544 [180, 14, 50]


In [72]:
# lMod_GBT  0.8999060150375939 62 10 6
from pyspark.ml.classification import GBTClassifier
lgp = 0.0
maxIter = 0
maxDepth = 5, 
maxBins = 32
lgp_save = [maxIter, regParam]
lMod = None
for i in range(100):
    maxIter, maxDepth, maxBins = int(random.uniform(10, 100)), int(random.uniform(5, 30)), int(random.uniform(32, 30))
    lr = GBTClassifier(featuresCol="features", labelCol="purchase", maxIter=maxIter, maxDepth=maxDepth, maxBins=maxBins)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
        FROM predictions
        WHERE purchase = 1
        """).toPandas()
    res = df.iloc[0,0]
    if lgp < res:
        lgp = res
        lgp_save = [maxIter, maxDepth, maxBins]
        lMod_GBT = model
    print(df.iloc[0,0], maxIter, maxDepth, maxBins)
print(lgp, [maxIter, maxDepth, maxBins])

0.8543233082706767 41 15 8
0.8411654135338346 78 27 14


Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'GBTClassifier' object has no attribute '_java_obj'
Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'GBTClassifier' object has no attribute '_java_obj'
Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'GBTClassifier' object has no attribute '_java_obj'


0.8458646616541353 26 29 15
0.856203007518797 91 22 10
0.8604323308270677 32 15 11
0.8999060150375939 62 10 6
0.8383458646616542 60 28 29


KeyboardInterrupt: 

In [43]:
schema = StructType() \
      .add("user_id", IntegerType(), True) \
      .add("item_id", IntegerType(), True) 
      
      
df_user_test = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/labs/slaba03/laba03_test.csv")
df_user_test.createOrReplaceTempView("test")

In [46]:
spark.sql(f"""SELECT tr.user_id
            , tr.item_id
            , coalesce(items_vector_fin.buy, 0.001) as item_buy
            , coalesce(user_vector_fin.buy, 0.001) as user_buy
            , items_vector_fin.genres_norm as item_vect
            , user_vector_fin.genres_norm as user_vect
        FROM test tr
        LEFT JOIN user_vector_fin
            ON tr.user_id = user_vector_fin.user_id
        LEFT JOIN items_vector_fin
            ON tr.item_id = items_vector_fin.item_id
        WHERE items_vector_fin.genres_norm is null
        """).count()

0

In [56]:
test_prep = spark.sql(f"""SELECT tr.user_id
            , tr.item_id
            , coalesce(items_vector_fin.buy, 0.001) as item_buy
            , coalesce(u1.buy, 0.001) as user_buy
            , items_vector_fin.genres_norm as item_vect
            , u2.genres_norm as user_vect
        FROM test tr
        LEFT JOIN user_vector_fin u1
            ON tr.user_id = u1.user_id
        LEFT JOIN items_vector_fin
            ON tr.item_id = items_vector_fin.item_id
        LEFT JOIN user_vector_fin u2
            ON (tr.user_id != u2.user_id AND u2.user_id = 999999)
        WHERE u1.genres_norm is null
        
        UNION ALL
        
        SELECT tr.user_id
            , tr.item_id
            , coalesce(items_vector_fin.buy, 0.001) as item_buy
            , coalesce(u1.buy, 0.001) as user_buy
            , items_vector_fin.genres_norm as item_vect
            , u1.genres_norm as user_vect
        FROM test tr
        LEFT JOIN user_vector_fin u1
            ON tr.user_id = u1.user_id
        LEFT JOIN items_vector_fin
            ON tr.item_id = items_vector_fin.item_id
        WHERE u1.genres_norm is NOT null
        """)

In [61]:
test_prep = assembler.transform(test_prep)

In [62]:
test_prep = test_prep.drop("item_buy", "user_buy", "item_vect", "user_vect")

In [89]:
predictions = lMod_GBT.transform(test_prep)

In [87]:
predictions = lMod_RF.transform(test_prep)

In [76]:
type(lMod_GBT)

pyspark.ml.classification.GBTClassificationModel

In [64]:
predictions.show(1)

+-------+-------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------------------+--------------------+--------------------+----------+
| 816426|   8389|(66,[0,4,8,64],[0...|[3.34786493254183...|[0.96603485069953...|       0.0|
+-------+-------+--------------------+--------------------+--------------------+----------+
only showing top 1 row



In [90]:
df = predictions\
        .sort("user_id", "item_id")\
        .select("user_id", "item_id", predictions.prediction.alias("purchase"))\
        .toPandas()
df.to_csv('lab03.csv')

In [60]:
predictions = lMod.transform(test_prep)

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: user_id, item_id, item_buy, user_buy, item_vect, user_vect'

In [49]:
spark.sql("""SELECT *
            FROM user_vector_fin
            WHERE user_id = 999999 """).toPandas()

Unnamed: 0,user_id,buy,genres_norm
0,999999,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [168]:
spark.sql("""SELECT item_vect
        FROM test_prep
        limit 10
        """).toPandas()

Unnamed: 0,item_vect
0,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
1,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
2,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
3,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
4,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
5,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
6,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
7,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
8,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."
9,"(0.0, 0.0, 0.7071067811865475, 0.0, 0.0, 0.0, ..."


In [171]:
spark.sql("""SELECT item_vect
        FROM test_prep
        limit 1
        """).show(1, False, True)

-RECORD 0-------------------------------------------------------
 item_vect | (31,[2,6],[0.7071067811865475,0.7071067811865475]) 



In [165]:
spark.sql("""SELECT DISTINCT user_id, item_id
        FROM test_prep
        WHERE item_buy is null
            or user_buy is null
            or item_vect is null
            or user_vect is null
        """).count()

295038

In [167]:
spark.sql("""SELECT DISTINCT user_id
        FROM test_prep
        WHERE item_buy is null
            or user_buy is null
            or item_vect is null
            or user_vect is null
        """).count()

266

In [166]:
test_prep.count()

2156840

In [152]:
lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=40, regParam=0.1)
model = lr.fit(trainDF)

In [None]:
# Расчет категории для года выпуска материала
df_items = df_items.withColumn("year_cat", array((((df_items.year - 1900.0)/10) + 1).cast(IntegerType()).cast(StringType())))\
        .withColumn("year_cat_str", (((df_items.year - 1900.0)/10) + 1).cast(IntegerType()).cast(StringType()))

count_vectorizer_year = CountVectorizer(inputCol='year_cat', outputCol="year_cat_vector", binary=False)
count_vectorizer_year_model = count_vectorizer_year.fit(df_items)
df_items = count_vectorizer_year_model.transform(df_items)


normalizer_year = Normalizer(inputCol='year_cat_vector', outputCol="year_cat_norm")
df_items = normalizer_year.transform(df_items)
df_items = df_items.drop("year_cat_vector", "region_id", "channel_id", "datetime_show_start", "datetime_show_stop")

In [None]:
from pyspark.ml.feature import VectorAssembler

train_features = VectorAssembler(inputCols=["item_buy", "user_buy"], outputCol="features").transform(train_features)

In [None]:
train_features.limit(10).show()

In [None]:
# normalizer_year = Normalizer(inputCol='features', outputCol="train_features_norm")
# train_features = normalizer_year.transform(train_features)

In [None]:
train_features.select("purchase", "features")\
    .limit(10).show(2, False, True)

In [None]:
trainDF = train_features.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)
testDf = train_features.join(trainDF\
        , (train_features.user_id == trainDF.user_id) & (train_features.item_id == trainDF.item_id)\
        , how="leftanti").coalesce(10).cache()

In [None]:
f'{trainDF.count()} {testDf.count()} {trainDF.count() + testDf.count()} {train_features.count()}'

In [None]:
import random
lgp = 0.0
lgp_save = [maxIter, regParam]
for i in range(100):
    maxIter, regParam = int(random.uniform(10, 1000)), random.uniform(0, 1)
    lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=maxIter, regParam=regParam)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
        FROM predictions
        WHERE purchase = 1
        """).toPandas()
    res = df.iloc[0,0]
    if lgp < res:
        lgp = res
        lgp_save = [maxIter, regParam]
    print(df.iloc[0,0], maxIter, regParam)
print(lgp, [maxIter, regParam])

In [None]:
for i in range(100):
    maxIter, regParam = int(random.uniform(10, 100)), random.uniform(0, 1)
    lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=maxIter, regParam=regParam)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
        FROM predictions
        WHERE purchase = 1
        """).toPandas()
    res = df.iloc[0,0]
    if lgp < res:
        lgp = res
        lgp_save = [maxIter, regParam]
    print(df.iloc[0,0], maxIter, regParam)
print(lgp, [maxIter, regParam])

In [None]:
for i in range(100):
    maxIter, regParam = int(random.uniform(10, 100)), random.uniform(0, 0.1)
    lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=maxIter, regParam=regParam)
    model = lr.fit(trainDF)
    predictions = model.transform(testDf)
    predictions.createOrReplaceTempView("predictions")
    df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
        FROM predictions
        WHERE purchase = 1
        """).toPandas()
    res = df.iloc[0,0]
    if lgp < res:
        lgp = res
        lgp_save = [maxIter, regParam]
    print(df.iloc[0,0], maxIter, regParam)
print(lgp, [maxIter, regParam])

In [None]:
0.005968778696051423 [29, 0.020436539365475917]maxIter=maxIter, regParam=regParam

In [None]:
df = spark.sql(f"""SELECT sum(case when purchase = prediction then 1 else 0 end) / count(1) as pr    
    FROM predictions
    WHERE purchase = 1
    """).toPandas()
print(df.iloc[0,0])

In [None]:
predictions = model.transform(testDf)
predictions.createOrReplaceTempView("predictions")
df = spark.sql(f"""SELECT purchase, prediction, sum(case when prediction = prediction then 1 else 0 end) as pr
        FROM predictions
        GROUP BY purchase, prediction
        limit 10
        """).toPandas()
print(df)

In [None]:
predictions.limit(10).toPandas()

In [None]:
tasks

In [None]:
from sklearn.model_selection import KFold
numFolds = 3 # more (10 or so) in practice
kf = KFold(n_splits=numFolds)
alphas = [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
tasks = []
for alpha in alphas:
    for fold in range(numFolds):
        tasks = tasks + [(alpha, fold)]

In [None]:
sc = spark.sparkContext
tasksRDD = sc.parallelize(tasks, numSlices = len(tasks))
tasksRDD.getNumPartitions()

In [None]:
trainingFeaturesBroadcast = sc.broadcast(trainDF)
trainingLabelsBroadcast = sc.broadcast(testDf)

In [None]:
train.limit(10).toPandas()

In [None]:
lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=10, regParam=0.01)
model = lr.fit(train)

In [None]:
predictionsDf = model.transform(testDf)
predictionsDf.registerTempTable('Predictions')

In [None]:
lr = LogisticRegression(featuresCol="features", labelCol="purchase", maxIter=15)

In [None]:
model = lr.fit(train_features)

In [None]:
schema = StructType() \
      .add("user_id", IntegerType(), True) \
      .add("item_id", IntegerType(), True) 
      
      
df_user_test = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/labs/slaba03/laba03_test.csv")
df_user_test.createOrReplaceTempView("test")

In [None]:
df_test = spark.sql(f"""SELECT test.user_id
            , test.item_id
            , coalesce(item_buy.buy, 0.002166) as item_buy
            , coalesce(user_buy.buy, 0.002165) as user_buy
        FROM test
        LEFT JOIN item_buy
            ON test.item_id = item_buy.item_id
        LEFT JOIN user_buy
            ON test.user_id = user_buy.user_id        
        """)

In [None]:
df_test = VectorAssembler(inputCols=["item_buy", "user_buy"], outputCol="features").transform(df_test)
# df_test = normalizer_year.transform(df_test)

In [None]:
predictions = model.transform(df_test)

In [None]:
predictions.limit(10).toPandas()

In [None]:
predictions

In [None]:
predictions.groupby('prediction').count().limit(5).toPandas()

In [None]:
predictions.groupby('prediction').count().limit(5).toPandas()

In [None]:
df = predictions\
        .sort("user_id", "item_id")\
        .select("user_id", "item_id", predictions.prediction.alias("purchase"))\
        .toPandas()

In [None]:
df.to_csv('lab03.csv')

In [None]:
from pyspark.ml.feature import VectorAssembler

VectorAssembler(inputCols=["buy"], outputCol="features").transform(df_user_buy)

In [None]:
df_user_buy.select((1.0, Vectors.dense([0.0, 1.1, 0.1]))).limit(10).show()

In [None]:
(1.0, Vectors.dense([0.0, 1.1, 0.1]))

In [None]:
# 8544
# df_user.filter((df_user.item_id == 8544) & (df_user.purchase == 1)).limit(100).toPandas()
# 824754
item_ids = df_user.filter((df_user.user_id == 824754) & (df_user.purchase == 1)).collect()
item_ids = [x[1] for x in item_ids]
item_ids

# test.csv

In [None]:
schema = StructType() \
      .add("user_id", IntegerType(), True) \
      .add("item_id", IntegerType(), True) 
      
      
df_user_test = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/labs/slaba03/laba03_test.csv")


In [None]:
# df_user_test.select(df_user_test.user_id).distinct().count()
df_user_test.select(df_user_test.item_id).distinct().count()
# df_user_test.count()

In [None]:
df_user_test.where(df_user_test.item_id.isin(ids)).toPandas()

# views_programmes.csv

In [None]:
read_users_schema = StructType(fields=[
    StructField('user_id', IntegerType()), 
    StructField('item_id', IntegerType()),
    StructField('ts_start', IntegerType()),
    StructField('ts_end', IntegerType()),
    StructField('item_type', StringType()),
]) 

df_views_programmes = spark.read.format("csv") \
      .option("header", True) \
      .schema(read_users_schema) \
      .load("/labs/slaba03/laba03_views_programmes.csv")

In [None]:
df_views_programmes.show(5, False, False)

# items.csv

In [None]:
read_items_schema = StructType(fields=[
    StructField('item_id', IntegerType()), 
    StructField('channel_id', FloatType()),
    StructField('datetime_availability_start', StringType()),
    StructField('datetime_availability_stop', StringType()),
    StructField('datetime_show_start', StringType()),
    StructField('datetime_show_stop', StringType()),
    StructField('content_type', IntegerType()),
    StructField('title', StringType(), nullable=True),
    StructField('year', FloatType(), nullable=True),
    StructField('genres', StringType()),
    StructField('region_id', IntegerType()),
]) 

df_items = (spark.read.format("csv")
            .option("header", True)
            .option("sep", "\t")
            .schema(read_items_schema)
            .load("/labs/slaba03/laba03_items.csv")
#             .filter("content_type == 1 and datetime_availability_stop == '2099-12-31T21:00:00Z'")
#             .drop('channel_id', 'content_type', 'region_id', 
#                   'datetime_availability_start', 
#                   'datetime_show_start', 'datetime_show_stop')
           )
            
    
df_items = (df_items
            .withColumn("year", 
                        when(df_items.item_id == 103377, 2008.0)
                        .when(df_items.item_id == 95141, 2014.0)
                        .when(df_items.item_id == 72544, 2009.0)
                        .when(df_items.item_id == 8544, 1994.0)
                        .otherwise(df_items.year))
            .withColumn("genres", 
                        when(df_items.item_id == 103377, 'Анимация, Короткометражные')
                        .otherwise(df_items.genres))
           )

#.withColumn(df_items.genres, 'Анимация,Короткометражные'))
    
df_items = (df_items
#             .filter(~df_items.genres.contains('Прочие'))
            .filter(~df_items.item_id.isNull())
            .repartition(16)
            .cache())

In [None]:
# Расчет категории для года выпуска материала
df_items = df_items.withColumn("year_cat", array((((df_items.year - 1910.0)/10) + 1).cast(IntegerType()).cast(StringType())))\
        .withColumn("year_cat_str", (((df_items.year - 1910.0)/10) + 1).cast(IntegerType()).cast(StringType()))

count_vectorizer_year = CountVectorizer(inputCol='year_cat', outputCol="year_cat_vector", binary=False)
count_vectorizer_year_model = count_vectorizer_year.fit(df_items)
df_items = count_vectorizer_year_model.transform(df_items)


normalizer_year = Normalizer(inputCol='year_cat_vector', outputCol="year_cat_norm")
df_items = normalizer_year.transform(df_items)
df_items = df_items.drop("year_cat_vector", "region_id", "channel_id", "datetime_show_start", "datetime_show_stop")

In [None]:
df_items.limit(5).toPandas()

In [None]:
# расчет фичи жанра для контента

def replace_genres(s):
    replace_map = {
        'Арт-хаус': 'Артхаус',
        'Боевики': 'Боевик',
        'Военные': 'Военный',
        'Военные': 'Военный',
        'Детские': 'Детский',
        'Для детей': 'Детский',
        'Для самых маленьких': 'Детский',
        'Для всей семьи': 'Семейные',
        'Для взрослых': 'Эротика',
        'Документальные': 'Документальный',
        'Драмы': 'Драма',
        'Западные мультфильмы': 'Зарубежные,Анимация',
        'Исторические': 'Исторический',
        'Короткометражки': 'Короткометражные',
        'Детский песни': 'Детский,Музыкальные',
        'Мультфильмы в 3D': 'Анимация',
        'Мультфильмы': 'Анимация',
        'Мультсериалы': 'Анимация,Сериалы',
        'Мюзиклы': 'Музыкальные',
        'Русские мультфильмы': 'Анимация,Русские',
        'Аниме': 'Анимация',
        'Спорт': 'Спортивные',
        'Спортивныеивные': 'Спортивные',
        'Наши': 'Русские',
        'Фильмы в 3D': 'Фильмы',
        'Юмористические': 'Юмористические,Передачи',
        'Кулинария': 'Передачи',
        'Игры': 'Передачи',
        'О здоровье': 'Передачи',
        'Охота и рыбалка': 'Передачи',
        'Реалити-шоу': 'Передачи',
        'Видеоигры': 'Видеоигры,Передачи',
        'Фильмы-спектакли': 'Музыкальные,Фильмы',
        'Познавательные': 'Развивающие,Передачи',
        'Хочу всё знать': 'Развивающие,Передачи',
        'Фантастические': 'Фантастика',
        'Фэнтези': 'Фантастика',
        'Союзмультфильм': 'Союзмультфильм,Анимация',
        'Юмористические': 'Комедии',
        'Развлекательные': 'Комедии',
        'Вестерн': 'Фильмы,Зарубежные,Боевик',
        'Советское кино': 'Советские,Фильмы',        
    }
    for key in replace_map:
        s = str(s).replace(key, replace_map[key])
        
    return s.lower().split(',')

replace_genres_udf = udf(replace_genres, ArrayType(StringType()))

#df_items_m2 = df_items_m.select(replace_genres_udf("genres").alias("genres"))
df_items = df_items.withColumn("genres2", replace_genres_udf("genres"))

#df_items_m2.limit(100).toPandas()

#df_items_m2 = df_items_m2.select(split("genres", ',').alias('genres'))
#df_items_m3 = df_items_m2.select(explode('genres').alias('genres'))
#df_items_m3.groupby('genres').count().sort('genres').show(100)

count_vectorizer = CountVectorizer(inputCol='genres2', outputCol="genres_vector", binary=False)
count_vectorizer_model = count_vectorizer.fit(df_items)
df_items = count_vectorizer_model.transform(df_items)


normalizer = Normalizer(inputCol='genres_vector', outputCol="genres_norm")
df_items = normalizer.transform(df_items)
df_items = df_items.drop('genres2')

df_items.limit(5).toPandas()

In [None]:
assembler = VectorAssembler(inputCols=["year_cat_norm", "genres_norm"], outputCol="features")
df_items = assembler.transform(df_items)
df_items.limit(5).toPandas()

In [None]:
df_items.select("item_id", "genres").take(5)

In [None]:
df_user.select("user_id", "item_id", "purchase").take(5)

In [None]:
df_user_genres = df_user.filter(df_user.purchase == 1)\
        .join(df_items, df_user.item_id == df_items.item_id, 'inner')\
        .select(df_user.user_id, df_items.genres.alias("genres"))

df_user_genres = df_user_genres.withColumn("genres2", replace_genres_udf("genres"))
df_user_genres = df_user_genres.select(df_user_genres.user_id, explode(df_user_genres.genres2))
df_user_genres = df_user_genres.groupBy('user_id').agg(collect_set('col').alias('genres2'))

df_user_genres = count_vectorizer_model.transform(df_user_genres)
df_user_genres = normalizer.transform(df_user_genres)
df_user_genres = df_user_genres.drop("genres2", "genres_vector")

# , df_items.year_cat

In [None]:
df_user_genres.limit(5).toPandas()

In [None]:
df_items.select("item_id", "year_cat_str").take(5)

In [None]:
df_user_year = df_user.filter(df_user.purchase == 1)\
        .join(df_items, df_user.item_id == df_items.item_id, 'inner')\
        .select(df_user.user_id, df_items.year_cat_str.alias("year_cat_str"))
df_user_year = df_user_year.groupBy('user_id').agg(collect_set('year_cat_str').alias('year_cat'))
df_user_year = count_vectorizer_year_model.transform(df_user_year)
df_user_year = normalizer_year.transform(df_user_year)
df_user_year = df_user_year.drop("year_cat", "year_cat_vector")

In [None]:
df_user_year.limit(5).toPandas()

In [None]:
df_users_norm = df_user_genres\
        .join(df_user_year, df_user_genres.user_id == df_user_year.user_id, 'inner')\
        .select(df_user_genres.user_id, df_user_year.year_cat_norm, df_user_genres.genres_norm)

In [None]:
df_users_norm.limit(5).toPandas()

In [None]:
assembler = VectorAssembler(inputCols=["year_cat_norm", "genres_norm"], outputCol="features")
df_users_features = assembler.transform(df_users_norm)

In [None]:
df_users_features.show(1)

In [None]:
# расчет фичи жанра для клиента

#df_items_m2.schema
# df_items_m2.groupby().sum().limit(10).show()
#d = df_items_m2.limit(2).collect()
#df_items_m2.groupby().agg()
#SparseVector(DenseVector(d[0]['genres_vector']) + DenseVector(d[1]['genres_vector']))

@udf(ArrayType(FloatType()))
def toDense(v):
    return DenseVector(v)
    #new_array = list([float(x) for x in v])
    #return new_array 

df_items_m2.withColumn("genres_vector2", toDense(df_items_m2.genres_vector)).groupby().sum()

# собираем набор для обучения

In [None]:
df_train = df_user\
        .join(df_users_features, df_user.user_id == df_users_features.user_id, 'inner')\
        .join(df_items, df_user.item_id == df_items.item_id, 'inner')\
        .select(df_users_features.features.alias('user_features'),\
               df_items.features.alias('item_features'),\
               df_user.purchase.alias('target'))

In [None]:
#df_user.limit(5).toPandas()
#df_items.limit(5).toPandas()
df_train.limit(5).toPandas()

# Считаем косинусы близости

In [None]:
# cosine similarity
dot_udf = udf(lambda x,y: float(x.dot(y)), DoubleType())

df_train = df_train.select(\
                dot_udf(df_train.user_year_cat_norm, df_train.item_year_cat_norm).alias("similarity_year_cat"),
                dot_udf(df_train.user_genres_norm, df_train.item_genres_norm).alias("similarity_genres"),\
               df_train.target)

df_train.limit(5).toPandas()

In [None]:
#assembler = VectorAssembler(inputCols=["similarity_year_cat", "similarity_genres"], outputCol="features")
assembler = VectorAssembler(inputCols=["user_features", "item_features"], outputCol="features")
df_train = assembler.transform(df_train)

In [None]:
df_train.limit(10).toPandas()

In [None]:
df_train

In [None]:
#df_train.groupby(df_train.similarity_year_cat).count().limit(20).toPandas()

lr = LogisticRegression(featuresCol='features', labelCol="target", maxIter=15)
lr_model = lr.fit(df_train)

In [None]:
lr_model

# Строим набор для тестирования

In [None]:
print(df_user_test.count())
df_user_test.limit(5).toPandas()

In [None]:
df_user_test = df_user_test\
        .join(df_users_features, df_user_test.user_id == df_users_features.user_id, 'inner')\
        .join(df_items, df_user_test.item_id == df_items.item_id, 'inner')\
        .select(df_users_features.features.alias('user_features'),\
               df_items.features.alias('item_features'),
               df_user_test.user_id, df_user_test.item_id)

In [None]:
print(df_user_test.count())
df_user_test.limit(5).toPandas()

In [None]:
df_user_test = df_user_test.select(\
                dot_udf(df_user_test.user_year_cat_norm, df_user_test.item_year_cat_norm).alias("similarity_year_cat"),
                dot_udf(df_user_test.user_genres_norm, df_user_test.item_genres_norm).alias("similarity_genres"),
                df_user_test.user_id, df_user_test.item_id)

df_user_test.limit(5).toPandas()

In [None]:
assembler = VectorAssembler(inputCols=["user_features", "item_features"], outputCol="features")
df_user_test = assembler.transform(df_user_test)

In [None]:
df_user_test.limit(5).toPandas()

In [None]:
predictions = lr_model.transform(df_user_test)

In [None]:
predictions.limit(10).toPandas()

In [None]:
predictions.groupby('prediction').count().limit(50).toPandas()

In [None]:
df_user_test.join(df_users_norm, df_user_test.user_id == df_users_norm.user_id, 'inner')\

In [None]:
df = predictions\
        .sort("user_id", "item_id")\
        .select("user_id", "item_id", predictions.prediction.alias("purchase"))\
        .toPandas()

In [None]:
df = df_user_test\
        .sort("user_id", "item_id")\
        .select("user_id", "item_id", lit(0.5).alias('purchase'))\
        .toPandas()

In [None]:
df

In [None]:
df.to_csv('lab03.csv')

In [None]:
#ids = df_items.filter(df_items.year.isNull()).collect()
#ids = [x[0] for x in ids]
#ids # [103377, 8544, 95141, 72544]
#df_items.filter(df_items.item_id.isin(ids)).toPandas()


#df_items.filter(df_items.item_id.isin(item_ids)).toPandas()

#df_items.filter(df_items.year.isNull()).toPandas()

#df_items.groupby(df_items.year).count().toPandas()
#df_items.filter(df_items.item_id == 103377).show()

In [None]:
# df_items.limit(100).toPandas()
# df_items.sort(df_items.item_id.desc()).limit(100).toPandas()
# df_items.filter((df_items.item_id.isNull()) | (~df_items.title.isNull())).limit(100).toPandas()
# df_items.filter(df_items.title.isNull()).limit(100).toPandas()

In [None]:
replace_genres_udf = udf(replace_genres, StringType())

#df_items_m = df_items.select('genres')

df_items_m2 = df_items_m.select(replace_genres_udf("genres").alias("genres"))

df_items_m2 = df_items_m2.select(split("genres", ',').alias('genres'))
df_items_m3 = df_items_m2.select(explode('genres').alias('genres'))
df_items_m3.groupby('genres').count().sort('genres').show(100)

In [None]:
df_items.filter("datetime_availability_stop == '2018-12-31T00:00:00Z'").show(5, False, False)

In [None]:
# for col in ['channel_id', 'content_type', 'genres', 'region_id']:
# for col in ['datetime_availability_start','datetime_availability_stop','datetime_show_start','datetime_show_stop']:
for col in ['year']:
    t = df_items.select(col).distinct()
    print(t.count())
    t.show(5, False, False)
    

In [None]:
items_df.rdd.getNumPartitions()

In [None]:
spark.stop()