In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
sc.getConf().get("spark.executor.instances")

'3'

1. Feature Engineering
2. Feature correlation
-  Feature Importance
3. Model Evaluation
4. HyperParameters tuning

**Load Data**

In [3]:
train = spark.read.csv("/labs/slaba03/laba03_train.csv", header=True)
print('train size: {}'. format(train.count()))

test = spark.read.csv("/labs/slaba03/laba03_test.csv", header=True)
print('test size: {}'. format(test.count()))

items = spark.read.options(delimiter='\t').csv("/labs/slaba03/laba03_items.csv", header=True)
print('items size: {}'. format(items.count()))

views = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", header=True)
print('views size: {}'. format(views.count()))

# merge to one dataframe (train+test)
df_all = train.union(test)

train size: 5032624
test size: 2156840
items size: 635568
views size: 20845607


**Join**

In [5]:
tmp = df_all.join(items, on=['item_id'], how='left').cache()
df_all = tmp.join(views, on=['item_id', 'user_id'], how='left').cache()
del tmp

from pyspark.sql.types import IntegerType, FloatType
df_all = df_all.withColumn("purchase", df_all["purchase"].cast(IntegerType()))
df_all = df_all.withColumn("year", df_all["year"].cast(IntegerType()))

df_all = df_all.select(['item_id', 'user_id', 'purchase', 'year', 'genres'])

- Доля покупок конкретного фильма
- Доля покупок у конкретного пользователя
- Доля покупок пользователя по категориям
- Доля покупок пользователя по годам
- категориальные признаки из жанров 1,2,3
- Доля покупок пользователя по категориям - для genre 1, 2, 3
- Доля покупок пользователя по годам - разбить фильмы по бинам в годах
- Посчитать статистики для бинов по возрасту фильма
- флаг 0,1 - покупал ли этот пользователь хотя бы раз

Split Genres

In [8]:
from pyspark.sql.functions import col, split 

df_all = df_all.withColumn("genre_1", split(col("genres"), ",")\
                     .getItem(0))
df_all = df_all.withColumn("genre_2", split(col("genres"), ",")\
                     .getItem(1))
df_all = df_all.withColumn("genre_3", split(col("genres"), ",")\
                     .getItem(2))

Separate target 0/1

In [10]:
df_all_1 = df_all.filter(df_all.purchase == '1')
df_all_0 = df_all.filter(df_all.purchase == '0')

Доля покупок конкретного фильма

In [11]:
film_sales = df_all_1.groupBy("item_id")\
                .count()\
                .withColumnRenamed("count", "sale_count")\
                .orderBy("sale_count", ascending=False)

film_shows = df_all_0.groupBy("item_id")\
                .count()\
                .withColumnRenamed("count", "show_count")\
                .orderBy("show_count", ascending=False)

films_voronka = film_sales.join(film_shows, on=['item_id'], how='left').cache()

films_voronka = films_voronka.withColumn('film_percent_sales', \
                         films_voronka['sale_count']  / films_voronka['show_count'] )\
                        .select(['item_id', 'film_percent_sales'])

df_all = df_all.join(films_voronka, on=['item_id'], how='left').cache()

Доля покупок у конкретного пользователя

In [15]:
person_sales = df_all_1.groupBy("user_id")\
                .count()\
                .withColumnRenamed("count", "sale_count")\
                .orderBy("sale_count", ascending=False)

person_shows = df_all_0.groupBy("user_id")\
                .count()\
                .withColumnRenamed("count", "show_count")\
                .orderBy("show_count", ascending=False)

person_voronka = person_sales.join(person_shows, on=['user_id'], how='left').cache()

person_voronka = person_voronka.withColumn('user_percent_sales', \
                         person_voronka['sale_count']  / person_voronka['show_count'] )\
                        .select(['user_id', 'user_percent_sales'])

df_all = df_all.join(person_voronka, on=['user_id'], how='left').cache()

df_all = df_all.select(['user_id', 'item_id', 'purchase', 'year', 'genres', 'film_percent_sales', 'user_percent_sales',
               'genre_1', 'genre_2', 'genre_3'])

**Доля покупок фильмов пользователем по годам выпуска**

In [20]:
from pyspark.sql.functions import count, avg

temp_0 = df_all_0.groupBy("user_id", "year").agg(count("*")).alias('cnt_year_0')
temp_0 = temp_0.select(col("user_id"), col("year"), col('count(1)').alias('year_cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year").agg(count("*")).alias('cnt_year_1')
temp_1 = temp_1.select(col("user_id"), col("year"), col('count(1)').alias('year_cnt_1'))

film_users_voronka = temp_0.join(temp_1, on=['user_id', 'year'], how='left').cache()

film_users_voronka = film_users_voronka.withColumn('user_percent_years', \
                         film_users_voronka['year_cnt_1']  / film_users_voronka['year_cnt_0'] )\
                        .select(['user_id', 'year', 'user_percent_years'])

df_all = df_all.join(film_users_voronka, on=['user_id', 'year'], how='left').cache()

del temp_0, temp_1

**Доля покупок фильмов пользователем в зависимости от жанра**

In [26]:
temp_0 = df_all_0.groupBy("user_id", "genre_1").agg(count("*")).alias('cnt_genre_0')
temp_0 = temp_0.select(col("user_id"), col("genre_1"), col('count(1)').alias('year_cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "genre_1").agg(count("*")).alias('cnt_genre_1')
temp_1 = temp_1.select(col("user_id"), col("genre_1"), col('count(1)').alias('year_genre_1'))

genres_users_voronka = temp_0.join(temp_1, on=['user_id', 'genre_1'], how='left').cache()

genres_users_voronka = genres_users_voronka.withColumn('user_percent_genres', \
                         genres_users_voronka['year_genre_1']  / genres_users_voronka['year_cnt_0'] )\
                        .select(['user_id', 'genre_1', 'user_percent_genres'])

df_all = df_all.join(genres_users_voronka, on=['user_id', 'genre_1'], how='left').cache()

del temp_0, temp_1

---

In [31]:
temp_0 = df_all_0.groupBy("user_id", "genre_2").agg(count("*")).alias('cnt_genre_0')
temp_0 = temp_0.select(col("user_id"), col("genre_2"), col('count(1)').alias('cnt_genre_0'))

temp_1 = df_all_1.groupBy("user_id", "genre_2").agg(count("*")).alias('cnt_genre_2')
temp_1 = temp_1.select(col("user_id"), col("genre_2"), col('count(1)').alias('cnt_genre_1'))

genres_users_voronka = temp_0.join(temp_1, on=['user_id', 'genre_2'], how='left').cache()

genres_users_voronka_2 = genres_users_voronka.withColumn('user_percent_genres_2', \
                         genres_users_voronka['cnt_genre_0']  / genres_users_voronka['cnt_genre_1'] )\
                        .select(['user_id', 'genre_2', 'user_percent_genres_2'])

df_all = df_all.join(genres_users_voronka_2, on=['user_id', 'genre_2'], how='left').cache()

del temp_0, temp_1

---

In [36]:
temp_0 = df_all_0.groupBy("user_id", "genre_3").agg(count("*")).alias('cnt_genre_0')
temp_0 = temp_0.select(col("user_id"), col("genre_3"), col('count(1)').alias('cnt_genre_0'))

temp_1 = df_all_1.groupBy("user_id", "genre_3").agg(count("*")).alias('cnt_genre_1')
temp_1 = temp_1.select(col("user_id"), col("genre_3"), col('count(1)').alias('cnt_genre_1'))

genres_users_voronka = temp_0.join(temp_1, on=['user_id', 'genre_3'], how='left').cache()

genres_users_voronka_3 = genres_users_voronka.withColumn('user_percent_genres_3', \
                         genres_users_voronka['cnt_genre_0']  / genres_users_voronka['cnt_genre_1'] )\
                        .select(['user_id', 'genre_3', 'user_percent_genres_3'])

df_all = df_all.join(genres_users_voronka_3, on=['user_id', 'genre_3'], how='left').cache()

del temp_0, temp_1

Create Bins

In [41]:
from pyspark.sql import functions as F
df_all = df_all.withColumn('year_bin', F.when(col('year') <= 1950, 'old').\
                  when((col('year') > 1950) & (col('year') <= 1970), 'old_middle').\
                  when((col('year') > 1970) & (col('year') <= 1990), 'middle').\
                  when((col('year') >1990) & (col('year') <= 2000), 'middle_new').\
                  when((col('year') > 2000) & (col('year') <= 2010), 'new').\
                  when(col('year') > 2010, 'newest')\
                      .otherwise('hz'))#.select(['year', 'year_bin']).show(10)

**Статистика покупок фильмов по годам**

In [42]:
film_type = 'old'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_1 = film_bin_stat.withColumn('film_bin_stat_1', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_1'])

del temp_0, temp_1

film_type = 'old_middle'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_2 = film_bin_stat.withColumn('film_bin_stat_2', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_2'])

del temp_0, temp_1

film_type = 'middle'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_3 = film_bin_stat.withColumn('film_bin_stat_3', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_3'])

del temp_0, temp_1

film_type = 'middle_new'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_4 = film_bin_stat.withColumn('film_bin_stat_4', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_4'])

del temp_0, temp_1

film_type = 'new'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_5 = film_bin_stat.withColumn('film_bin_stat_5', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_5'])

del temp_0, temp_1

film_type = 'newest'
df_all_1 = df_all.filter((df_all.purchase == '1') & (df_all.year_bin == film_type))
df_all_0 = df_all.filter((df_all.purchase == '0') & (df_all.year_bin == film_type))

temp_0 = df_all_0.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_0')
temp_0 = temp_0.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_0'))

temp_1 = df_all_1.groupBy("user_id", "year_bin").agg(count("*")).alias('cnt_1')
temp_1 = temp_1.select(col("user_id"), col("year_bin"), col('count(1)').alias('cnt_1'))

film_bin_stat = temp_0.join(temp_1, on=['user_id', 'year_bin'], how='left').cache()

film_bin_stat_6 = film_bin_stat.withColumn('film_bin_stat_6', \
                         film_bin_stat['cnt_1']  / film_bin_stat['cnt_0'] )\
                        .select(['user_id', 'year_bin', 'film_bin_stat_6'])

del temp_0, temp_1

df_all = df_all.join(film_bin_stat_1, on=['user_id', 'year_bin'], how='left').cache()
df_all = df_all.join(film_bin_stat_2, on=['user_id', 'year_bin'], how='left').cache()
df_all = df_all.join(film_bin_stat_3, on=['user_id', 'year_bin'], how='left').cache()
df_all = df_all.join(film_bin_stat_4, on=['user_id', 'year_bin'], how='left').cache()
df_all = df_all.join(film_bin_stat_5, on=['user_id', 'year_bin'], how='left').cache()
df_all = df_all.join(film_bin_stat_6, on=['user_id', 'year_bin'], how='left').cache()

**Флаг, что пользователь покупал хотя бы один раз**

In [49]:
df_all_1 = df_all.filter(df_all.purchase == '1').select(['user_id', 'purchase'])
df_all_1 = df_all_1.dropDuplicates(['user_id'])
user_sell_flags = df_all_1.withColumnRenamed('purchase', 'sell_flag')

df_all = df_all.join(user_sell_flags, on=['user_id'], how='left').cache()

**Как интенсивно покупает пользователь**

In [51]:
df_all_1 = df_all.filter(df_all.purchase == '1').select(['user_id', 'purchase'])

sale_user_count = df_all_1.groupBy("user_id")\
                .count()\
                .withColumnRenamed("count", "sale_user_count")
df_all = df_all.join(sale_user_count, on=['user_id'], how='left').cache()

**Покупаемость item_id**

In [54]:
df_all_1 = df_all.filter(df_all.purchase == '1').select(['item_id', 'purchase'])

sale_item_count = df_all_1.groupBy("item_id")\
                .count()\
                .withColumnRenamed("count", "sale_item_count")
df_all = df_all.join(sale_item_count, on=['item_id'], how='left').cache()

# df_all = df_all.select(['user_id', 'item_id', 'purchase', 'year', 'film_percent_sales', 'user_percent_sales'])

df_all = df_all.select(['user_id','item_id','purchase', 'year_bin', 'genre_3', 'genre_2', 'genre_1', 
                        'year',  'film_percent_sales', 'user_percent_sales', 'user_percent_years', 
                        'user_percent_genres', 'user_percent_genres_2', 'user_percent_genres_3', 
                        'film_bin_stat_1', 'film_bin_stat_2', 'film_bin_stat_3', 'film_bin_stat_4', 
                        'film_bin_stat_5', 'film_bin_stat_6', 'sell_flag', 'sale_user_count', 'sale_item_count'])

Categorical Features

In [58]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="genre_1", outputCol="genre_1_cat")
df_all = indexer.setHandleInvalid("keep").fit(df_all).transform(df_all)
print('done_1')
indexer = StringIndexer(inputCol="genre_2", outputCol="genre_2_cat")
df_all = indexer.setHandleInvalid("keep").fit(df_all).transform(df_all)
print('done_2')
indexer = StringIndexer(inputCol="genre_3", outputCol="genre_3_cat")
df_all = indexer.setHandleInvalid("keep").fit(df_all).transform(df_all)
print('done_3')
indexer = StringIndexer(inputCol="year_bin", outputCol="year_bin_cat")
df_all = indexer.setHandleInvalid("keep").fit(df_all).transform(df_all)

done_1
done_2
done_3


**Filling NaN**

In [60]:
# df_all = df_all.fillna(0, subset=['year', 'film_percent_sales', 'user_percent_sales'])
df_all = df_all.fillna(0, subset=['year_bin_cat', 'genre_3_cat', 'genre_2_cat', 'genre_1_cat', 
                        'year',  'film_percent_sales', 'user_percent_sales', 'user_percent_years', 
                        'user_percent_genres', 'user_percent_genres_2', 'user_percent_genres_3', 
                        'film_bin_stat_1', 'film_bin_stat_2', 'film_bin_stat_3', 'film_bin_stat_4', 
                        'film_bin_stat_5', 'film_bin_stat_6', 'sell_flag', 'sale_user_count', 'sale_item_count'])

df_all = df_all.select(['user_id','item_id','purchase', 'year_bin_cat', 'genre_3_cat', 'genre_2_cat', 'genre_1_cat', 
                        'year',  'film_percent_sales', 'user_percent_sales', 'user_percent_years', 
                        'user_percent_genres', 'user_percent_genres_2', 'user_percent_genres_3', 
                        'film_bin_stat_1', 'film_bin_stat_2', 'film_bin_stat_3', 'film_bin_stat_4', 
                        'film_bin_stat_5', 'film_bin_stat_6', 'sell_flag', 'sale_user_count', 'sale_item_count'])

**Convert data types**

In [63]:
from pyspark.sql.types import FloatType

df_all = df_all.withColumn("year_bin_cat", df_all["year_bin_cat"].cast(FloatType()))
df_all = df_all.withColumn("genre_3_cat", df_all["genre_3_cat"].cast(FloatType()))
df_all = df_all.withColumn("genre_2_cat", df_all["genre_2_cat"].cast(FloatType()))
df_all = df_all.withColumn("genre_1_cat", df_all["genre_1_cat"].cast(FloatType()))
df_all = df_all.withColumn("film_percent_sales", df_all["film_percent_sales"].cast(FloatType()))
df_all = df_all.withColumn("user_percent_sales", df_all["user_percent_sales"].cast(FloatType()))
df_all = df_all.withColumn("user_percent_years", df_all["user_percent_years"].cast(FloatType()))
df_all = df_all.withColumn("user_percent_genres", df_all["user_percent_genres"].cast(FloatType()))
df_all = df_all.withColumn("user_percent_genres_2", df_all["user_percent_genres_2"].cast(FloatType()))
df_all = df_all.withColumn("user_percent_genres_3", df_all["user_percent_genres_3"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_1", df_all["film_bin_stat_1"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_2", df_all["film_bin_stat_2"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_3", df_all["film_bin_stat_3"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_4", df_all["film_bin_stat_4"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_5", df_all["film_bin_stat_5"].cast(FloatType()))
df_all = df_all.withColumn("film_bin_stat_6", df_all["film_bin_stat_6"].cast(FloatType()))

df_all = df_all.withColumn("sale_user_count", df_all["sale_user_count"].cast(FloatType()))
df_all = df_all.withColumn("sale_item_count", df_all["sale_item_count"].cast(FloatType()))

Round values

In [65]:
import pyspark.sql.functions as func
df_all = df_all.withColumn("year_bin_cat", func.round(df_all["year_bin_cat"], 6))
df_all = df_all.withColumn("genre_3_cat", func.round(df_all["genre_3_cat"], 6))
df_all = df_all.withColumn("genre_2_cat", func.round(df_all["genre_2_cat"], 6))
df_all = df_all.withColumn("genre_1_cat", func.round(df_all["genre_1_cat"], 6))
df_all = df_all.withColumn("film_percent_sales", func.round(df_all["film_percent_sales"], 6))
df_all = df_all.withColumn("user_percent_sales", func.round(df_all["user_percent_sales"], 6))
df_all = df_all.withColumn("user_percent_years", func.round(df_all["user_percent_years"], 6))
df_all = df_all.withColumn("user_percent_genres", func.round(df_all["user_percent_genres"], 6))
df_all = df_all.withColumn("user_percent_genres_2", func.round(df_all["user_percent_genres_2"], 6))
df_all = df_all.withColumn("user_percent_genres_3", func.round(df_all["user_percent_genres_3"], 6))
df_all = df_all.withColumn("film_bin_stat_1", func.round(df_all["film_bin_stat_1"], 6))
df_all = df_all.withColumn("film_bin_stat_2", func.round(df_all["film_bin_stat_2"], 6))
df_all = df_all.withColumn("film_bin_stat_3", func.round(df_all["film_bin_stat_3"], 6))
df_all = df_all.withColumn("film_bin_stat_4", func.round(df_all["film_bin_stat_4"], 6))
df_all = df_all.withColumn("film_bin_stat_5", func.round(df_all["film_bin_stat_5"], 6))
df_all = df_all.withColumn("film_bin_stat_6", func.round(df_all["film_bin_stat_6"], 6))

df_all = df_all.withColumn("sale_user_count", func.round(df_all["sale_user_count"], 6))
df_all = df_all.withColumn("sale_item_count", func.round(df_all["sale_item_count"], 6))

**Modeling**

Create feature vector

In [67]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=['year_bin_cat', 'genre_3_cat', 'genre_2_cat', 'genre_1_cat', 
                        'year',  'film_percent_sales', 'user_percent_sales', 'user_percent_years', 
                        'user_percent_genres', 'user_percent_genres_2', 'user_percent_genres_3', 
                        'film_bin_stat_1', 'film_bin_stat_2', 'film_bin_stat_3', 'film_bin_stat_4', 
                        'film_bin_stat_5', 'film_bin_stat_6', 'sell_flag', 'sale_user_count', 'sale_item_count'],
    outputCol="features")

df_all_transformed = assembler.transform(df_all)


df_all_transformed = df_all_transformed.select(['user_id', 'item_id', 'purchase','features'])

**Normalization**

In [83]:
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
normalized_data = normalizer.transform(df_all_transformed)

**Split data**

In [86]:
# train = df_all_transformed.filter(df_all_transformed.purchase.isNotNull())
# test = df_all_transformed.filter(df_all_transformed.purchase.isNull())

train = normalized_data.filter(normalized_data.purchase.isNotNull())
test = normalized_data.filter(normalized_data.purchase.isNull())

In [87]:
train.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- normFeatures: vector (nullable = true)



Split train df to check model metrics

In [88]:
train_sample = train.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)
val_sample = train.join(train_sample, on=['user_id', 'item_id'], how="leftanti")

Model

**GBDT**

In [None]:
%%time
gbt_model = gbt.fit(train_sample)
# gbt_model = gbt.fit(train_short)

In [232]:
gbt_model.featureImportances

SparseVector(18, {0: 0.0089, 1: 0.0611, 2: 0.0551, 3: 0.0257, 4: 0.0269, 5: 0.1776, 6: 0.129, 7: 0.1504, 8: 0.1004, 9: 0.1712, 10: 0.0896, 11: 0.0011, 12: 0.0004, 14: 0.0012, 16: 0.0015})

In [233]:
predictions_gbt = gbt_model.transform(val_sample)

In [79]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", \
                                          labelCol="purchase", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions_gbt)

**Random Forest**

**LR**

In [99]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="normFeatures", labelCol='purchase', maxIter=15, regParam=0.01)

In [100]:
%%time
lr_model = lr.fit(train_sample)

CPU times: user 130 ms, sys: 7.4 ms, total: 137 ms
Wall time: 1min 13s


In [101]:
predictions_lr = lr_model.transform(val_sample)

In [102]:
evaluator.evaluate(predictions_lr)

0.9636491532290061

In [98]:
lr_model.maxIter

Param(parent='LogisticRegression_a19ae7bef07a', name='maxIter', doc='maximum number of iterations (>= 0)')

In [None]:
0.9544210565111111

0.951874561786903

0.9473850805864773

**Cross validation**

---
**Make prediction**

In [104]:
# predictions_test = gbt_model.transform(test)
predictions_test = lr_model.transform(test)

In [105]:
predictions_test.count()

2156840

In [80]:
test.count()

2156840

In [81]:
predictions_test.show(2)

+-------+-------+--------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|purchase|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------+--------------------+--------------------+--------------------+----------+
| 867363|  73483|    null|(18,[0,1,2,3,4,5,...|[1.54369627086296...|[0.95636969378535...|       0.0|
| 867363|  10558|    null|(18,[0,1,2,3,4,5,...|[1.54369627086296...|[0.95636969378535...|       0.0|
+-------+-------+--------+--------------------+--------------------+--------------------+----------+
only showing top 2 rows



In [106]:
predictions_test.select(['user_id', 'item_id', 'probability']).show(2, False)

+-------+-------+------------------------------------------+
|user_id|item_id|probability                               |
+-------+-------+------------------------------------------+
|871154 |100140 |[0.9988080672995057,0.0011919327004943977]|
|878352 |100140 |[0.9987286834814678,0.001271316518532219] |
+-------+-------+------------------------------------------+
only showing top 2 rows



In [107]:
from pyspark.sql import functions as f
lastelement=f.udf(lambda v:float(v[1]),FloatType())
predictions_test = predictions_test.withColumn("purchase", lastelement("probability")).\
                    select(['user_id', 'item_id', 'purchase'])

In [86]:
predictions_test.count()

2156840

In [84]:
predictions_test.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- purchase: float (nullable = true)



In [108]:
predictions_test_sorted = predictions_test.orderBy(["user_id", "item_id"], ascending=[1, 1])

In [109]:
predictions_test_sorted_repart = predictions_test_sorted.repartition(1)

In [110]:
# predictions_test_sorted.write.csv('lab03.csv')
predictions_test_sorted_repart.write.csv('lab03_13_03_1500')

**Move submission from hdfs to local**

In [116]:
! hdfs dfs -ls /user/dmitry.ulogov/

Found 7 items
drwx------   - dmitry.ulogov dmitry.ulogov          0 2021-03-13 15:33 /user/dmitry.ulogov/.Trash
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-13 11:40 /user/dmitry.ulogov/.sparkStaging
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 17:31 /user/dmitry.ulogov/gbt_model
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         84 2021-02-26 21:44 /user/dmitry.ulogov/lab01.json
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        458 2021-03-05 15:32 /user/dmitry.ulogov/lab02.json
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 20:36 /user/dmitry.ulogov/lab03
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-13 15:32 /user/dmitry.ulogov/lab03_13_03_1500


In [115]:
! hdfs dfs -rm -r /user/dmitry.ulogov/lab03.csv

21/03/13 15:33:36 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/dmitry.ulogov/lab03.csv' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/dmitry.ulogov/.Trash/Current/user/dmitry.ulogov/lab03.csv


In [None]:
! hdfs dfs -rm -r /user/dmitry.ulogov/lab03

In [118]:
! hdfs dfs -ls /user/dmitry.ulogov/lab03_13_03_1500

Found 2 items
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov          0 2021-03-13 15:32 /user/dmitry.ulogov/lab03_13_03_1500/_SUCCESS
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov   55481663 2021-03-13 15:32 /user/dmitry.ulogov/lab03_13_03_1500/lab03.csv


In [117]:
! hdfs dfs -mv /user/dmitry.ulogov/lab03_13_03_1500/part-00000-177647e9-5f42-4b7c-9d43-e38b68d94505-c000.csv /user/dmitry.ulogov/lab03_13_03_1500/lab03.csv

In [119]:
! hdfs dfs -get /user/dmitry.ulogov/lab03_13_03_1500/lab03.csv /data/home/dmitry.ulogov

In [None]:
! hdfs dfs -rm -r /user/dmitry.ulogov/lab03_13_03_1500

In [122]:
import pandas as pd
sub = pd.read_csv('lab03.csv', header=None)
sub.columns = ['user_id', 'item_id', 'purchase']

In [126]:
sub = sub.sort_values(by=['user_id', 'item_id'])
sub.to_csv('lab03.csv')

In [91]:
! hdfs dfs -ls /user/dmitry.ulogov/lab03

Found 2 items
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov          0 2021-03-12 20:24 /user/dmitry.ulogov/lab03/_SUCCESS
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov   53341201 2021-03-12 20:24 /user/dmitry.ulogov/lab03/part-00000-1a57cf5d-6e25-445c-ab6e-6a08e160ea04-c000.csv


In [95]:
! hdfs dfs -mv /user/dmitry.ulogov/lab03/part-00000-1a57cf5d-6e25-445c-ab6e-6a08e160ea04-c000.csv /user/dmitry.ulogov/lab03/lab03.csv

In [96]:
! hdfs dfs -ls /user/dmitry.ulogov/lab03

Found 2 items
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov          0 2021-03-12 20:24 /user/dmitry.ulogov/lab03/_SUCCESS
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov   53341201 2021-03-12 20:24 /user/dmitry.ulogov/lab03/lab03.csv


In [99]:
! hdfs dfs -cp /user/dmitry.ulogov/lab03/lab03.csv /user/dmitry.ulogov

In [100]:
! hdfs dfs -ls /user/dmitry.ulogov/

Found 7 items
drwx------   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 15:32 /user/dmitry.ulogov/.Trash
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 18:50 /user/dmitry.ulogov/.sparkStaging
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 17:31 /user/dmitry.ulogov/gbt_model
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         84 2021-02-26 21:44 /user/dmitry.ulogov/lab01.json
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        458 2021-03-05 15:32 /user/dmitry.ulogov/lab02.json
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-12 20:36 /user/dmitry.ulogov/lab03
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov   53341201 2021-03-12 20:39 /user/dmitry.ulogov/lab03.csv


---

In [115]:
import pandas as pd
my_sub = pd.read_csv('lab03.csv',header=None)
my_sub.columns = ['user_id', 'item_id', 'purchase']

In [120]:
my_sub = my_sub.sort_values(by=['user_id', 'item_id'], ascending = True)
my_sub.to_csv('lab03.csv')

In [None]:
sc.stop()