In [None]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# which resources to request when the pyspark shell is launched.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 5 --executor-memory 4g --executor-cores 2 --driver-memory 4g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's Python sources and the bundled py4j archive at the front of the
# import path; the py4j archive ends up first, the pyspark sources second.
spark_python_dir = os.path.join(spark_home, 'python')
py4j_archive = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
sys.path[:0] = [py4j_archive, spark_python_dir]

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration carrying only the application name.
conf = SparkConf().set("spark.app.name", "RIK_lab3")

# Start a session with that configuration, or reuse one that already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Show the SparkSession object as the cell output.
spark

In [None]:
from pyspark import keyword_only

from pyspark.ml import Transformer, Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover, CountVectorizer, VectorAssembler
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier, MultilayerPerceptronClassifier
    
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType, ArrayType, StringType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql.functions import udf, col, when, isnan, isnull, broadcast, lower, pandas_udf, row_number, explode, split
from pyspark.sql.functions import array, collect_set, collect_list, lit, asc, desc, sum, count, PandasUDFType

from pyspark.mllib.linalg import Vectors, SparseVector, DenseVector, VectorUDT

import json
import re

In [None]:
# List the contents of the lab's HDFS input directory.
!hdfs dfs -ls /labs/slaba03/

In [None]:
# Print the beginning of the items file to inspect its raw layout.
!hdfs dfs -head /labs/slaba03/laba03_items.csv

# train.csv

In [None]:
# Training interactions: one row per (user, item) pair with an integer
# purchase label. All three columns are nullable integers.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])

# Read the CSV (first line is a header) with the explicit schema, spread it
# over 10 partitions and cache it, since most later cells start from df_user.
df_user = (
    spark.read
    .csv("/labs/slaba03/laba03_train.csv", schema=schema, header=True)
    .repartition(10)
    .cache()
)

In [None]:
# Peek at the first five training rows.
df_user.show(5)

# test.csv

In [None]:
# Test pairs to score: same layout as the training file minus the label.
# Note that this rebinds the `schema` name used for the training file.
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
])

# Read the CSV (first line is a header), repartition to 10 and cache.
df_user_test = (
    spark.read
    .csv("/labs/slaba03/laba03_test.csv", schema=schema, header=True)
    .repartition(10)
    .cache()
)

In [None]:
# Preview a handful of test rows as a pandas frame. The preview depends only
# on df_user_test, so the cell runs on a fresh kernel (filtering on a list of
# item ids would require that list to be defined in the notebook first).
df_user_test.limit(5).toPandas()

# views_programmes.csv

In [None]:
# Layout of the viewing log: user, item, start and end timestamps of one
# viewing, and the type of the viewed item. Every column is nullable.
read_users_schema = (
    StructType()
    .add('user_id', IntegerType())
    .add('item_id', IntegerType())
    .add('ts_start', IntegerType())
    .add('ts_end', IntegerType())
    .add('item_type', StringType())
)

views_raw = spark.read.csv(
    "/labs/slaba03/laba03_views_programmes.csv",
    schema=read_users_schema,
    header=True,
)

# Collapse the log to one row per (user, item, item_type) holding the summed
# viewing duration (ts_end - ts_start of every log row in the group).
# `sum` here is pyspark.sql.functions.sum, imported at the top of the notebook.
df_views_programmes = (
    views_raw
    .withColumn('duration', col('ts_end') - col('ts_start'))
    .drop('ts_start', 'ts_end')
    .groupby('user_id', 'item_id', 'item_type')
    .agg(sum("duration").alias("duration"))
    .repartition(10)
    .cache()
)

df_views_programmes.limit(5).toPandas()

# items.csv

In [None]:
# Column layout of the tab-separated item catalogue. Every column is nullable.
read_items_schema = (
    StructType()
    .add('item_id', IntegerType())
    .add('channel_id', FloatType(), True)
    .add('datetime_availability_start', StringType(), True)
    .add('datetime_availability_stop', StringType(), True)
    .add('datetime_show_start', StringType(), True)
    .add('datetime_show_stop', StringType(), True)
    .add('content_type', IntegerType())
    .add('title', StringType(), True)
    .add('year', FloatType(), True)
    .add('genres', StringType(), True)
    .add('region_id', FloatType(), True)
)

items_raw = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    schema=read_items_schema,
    header=True,
    sep="\t",
)

# Hand-written overrides for specific item ids: the release year of four
# items and the genre string of one of them. All other rows keep their values.
year_patched = (
    when(col("item_id") == 103377, 2008.0)
    .when(col("item_id") == 95141, 2014.0)
    .when(col("item_id") == 72544, 2009.0)
    .when(col("item_id") == 8544, 1994.0)
    .otherwise(col("year"))
)
genres_patched = (
    when(col("item_id") == 103377, 'Анимация,Короткометражные')
    .otherwise(col("genres"))
)

# Apply the overrides, then repartition to 10 and cache the catalogue.
df_items = (
    items_raw
    .withColumn("year", year_patched)
    .withColumn("genres", genres_patched)
    .repartition(10)
    .cache()
)

# print(df_items.filter(df_items.item_id.isNull()).count())

# Расчет фичей года выпуска для контента

In [None]:
# Decade-like bucket of the release year: (year - 1910) / 10 + 1, cast to an
# integer and then to a string. A null year yields a null bucket.
decade_bucket = (
    (((col("year") - 1910.0) / 10) + 1)
    .cast(IntegerType())
    .cast(StringType())
)

# Store the bucket twice: wrapped in a one-element array (input format of
# CountVectorizer) and as a plain string (used for per-user aggregation).
df_items = (
    df_items
    .withColumn("year_cat", array(decade_bucket))
    .withColumn("year_cat_str", decade_bucket)
)

# df_items.filter(df_items.year.isNotNull()).limit(2).toPandas()

In [None]:
# Remove any 'year_cat_vector' column left over from an earlier run of this
# cell; dropping a column that does not exist is a no-op.
df_items = df_items.drop('year_cat_vector')

# Fit the bucket vocabulary on the item catalogue. The fitted model and the
# normalizer are reused later to encode the per-user bucket lists.
count_vectorizer_year = CountVectorizer(binary=False, inputCol='year_cat', outputCol="year_cat_vector")
count_vectorizer_year_model = count_vectorizer_year.fit(df_items)
normalizer_year = Normalizer(inputCol='year_cat_vector', outputCol="year_cat_norm")

# Encode the items, keep only the normalised vector per item, and leave
# df_items without the intermediate count vector.
items_with_year_vector = count_vectorizer_year_model.transform(df_items)
df_items_year = (
    normalizer_year
    .transform(items_with_year_vector)
    .select('item_id', 'year_cat_norm')
)
df_items = items_with_year_vector.drop("year_cat_vector")

df_items_year.show(2)

# Расчет фичей года выпуска для клиента

In [None]:
# Year bucket of every item a user actually purchased (purchase == 1).
purchased_year_buckets = (
    df_user
    .filter(col("purchase") == 1)
    .join(df_items, on="item_id", how="left")
    .select("user_id", "year_cat_str")
)

# Start from every distinct train user so that users without purchases are
# kept; they end up with an empty bucket set.
df_user_uniq = df_user.select('user_id').distinct()

df_user_year = (
    df_user_uniq
    .join(purchased_year_buckets, on="user_id", how="left")
    .select("user_id", "year_cat_str")
    .groupBy('user_id')
    .agg(collect_set('year_cat_str').alias('year_cat'))
)

# Encode the per-user bucket sets with the vocabulary fitted on the items,
# normalise, and keep only user_id plus the normalised vector.
df_user_year = (
    normalizer_year
    .transform(count_vectorizer_year_model.transform(df_user_year))
    .drop("year_cat", "year_cat_vector")
)

df_user_year.show(2)

# Расчет времени просмотра контента и пользователя

In [None]:
# Total viewing duration per item, restricted to items in the catalogue.
item_watch_total = (
    df_views_programmes
    .join(df_items, on='item_id', how='inner')
    .groupby('item_id')
    .agg(sum('duration').alias('one_duration'))
)

# Largest per-item total; None when no view matches a catalogue item.
max_duration = item_watch_total.agg({"one_duration": "max"}).collect()[0][0]

# Express every item's total as a share of the maximum.
item_watch_share = item_watch_total.select(
    'item_id',
    (col('one_duration') / lit(max_duration)).alias('duration'),
)

# One row per catalogue item; items that were never viewed get 0.0.
df_item_duration = (
    df_items
    .join(item_watch_share, on='item_id', how='left')
    .select('item_id', 'duration')
    .fillna(value=0.0, subset=["duration"])
    .distinct()
    .coalesce(10)
    .cache()
)

# df_item_duration.show(2)

In [None]:
# Total viewing duration per user, restricted to users of the training set.
#
# The views are joined against the DISTINCT train users. df_user holds one row
# per (user, item) training pair, so joining it directly would repeat every
# view once per training row of that user and inflate the per-user total by
# that row count.
train_users = df_user.select('user_id').distinct()

user_watch_total = (
    df_views_programmes
    .join(train_users, on='user_id', how='inner')
    .groupby('user_id')
    .agg(sum('duration').alias('one_duration'))
)

# Largest per-user total; None when no view matches a train user.
max_duration = user_watch_total.selectExpr("max(one_duration)").collect()[0][0]

# Express every user's total as a share of the maximum.
user_watch_share = user_watch_total.select(
    'user_id',
    (col('one_duration') / lit(max_duration)).alias('duration'),
)

# One row per train user; users without any recorded view get 0.0.
df_user_duration = (
    train_users
    .join(user_watch_share, on='user_id', how='left')
    .select('user_id', 'duration')
    .fillna(value=0.0, subset=["duration"])
    .coalesce(10)
    .cache()
)

# df_user_duration.show(5)

# Расчет индекса покупаемости для контента

In [None]:
# Purchase rate per item over the training pairs:
# sum of purchase labels divided by the number of non-null labels.
df_user_item_stat = (
    df_user
    .groupby("item_id")
    .agg((sum("purchase") / count("purchase")).alias("item_purchase_rate"))
)

df_user_item_stat.show(2)

# Расчет индекса покупаемости для клиента

In [None]:
# Purchase rate per user over the training pairs:
# sum of purchase labels divided by the number of non-null labels.
df_user_user_stat = (
    df_user
    .groupby("user_id")
    .agg((sum("purchase") / count("purchase")).alias("user_purchase_rate"))
)

df_user_user_stat.show(2)

# Расчет фичей жанра для контента

In [None]:
def replace_genres(s):
    """Normalise a comma-separated genre string into a list of genre names.

    Every pattern of the replacement table is substituted, in table order, by
    plain substring replacement on the whole string; the result is then split
    on commas. A replacement may itself contain commas, in which case one
    source genre expands into several output genres.

    Args:
        s: Raw genre string (anything else is converted with ``str``), or None.

    Returns:
        list[str]: Genre names after substitution. ``['General']`` when ``s``
        is None. Duplicates are not removed and items are not stripped.
    """
    # Ordered (pattern, replacement) pairs. The order is significant because
    # later rules act on the output of earlier ones, e.g. 'Детские' ->
    # 'Детский' must run before 'Детский песни', and 'Спортивныеивные' undoes
    # the effect of 'Спорт' on an already canonical 'Спортивные'. A tuple keeps
    # that order explicit and cannot hold silently overriding duplicate keys.
    # NOTE(review): matching is by substring, so a rule such as
    # 'Триллер' -> 'Триллеры' also rewrites an input that already reads
    # 'Триллеры' - confirm against the genre values present in the data.
    replacements = (
        ('Арт-хаус', 'Артхаус'),
        ('Боевики', 'Боевик'),
        ('Военные', 'Военный'),
        ('Детские', 'Детский'),
        ('Для детей', 'Детский'),
        ('Для самых маленьких', 'Детский'),
        ('Для всей семьи', 'Семейные'),
        ('Для взрослых', 'Эротика'),
        ('Документальные', 'Документальный'),
        ('Драмы', 'Драма'),
        ('Западные мультфильмы', 'Зарубежные,Анимация'),
        ('Исторические', 'Исторический'),
        ('Короткометражки', 'Короткометражные'),
        ('Детский песни', 'Детский,Музыкальные'),
        ('Мультфильмы в 3D', 'Анимация'),
        ('Мультфильмы', 'Анимация'),
        ('Мультсериалы', 'Анимация,Сериалы'),
        ('Мюзиклы', 'Музыкальные'),
        ('Русские мультфильмы', 'Анимация,Русские'),
        ('Аниме', 'Анимация'),
        ('Спорт', 'Спортивные'),
        ('Спортивныеивные', 'Спортивные'),
        ('Наши', 'Русские'),
        ('Фильмы в 3D', 'Фильмы'),
        ('Юмористические', 'Комедии'),
        ('Кулинария', 'Передачи'),
        ('Игры', 'Передачи'),
        ('О здоровье', 'Передачи'),
        ('Охота и рыбалка', 'Передачи'),
        ('Реалити-шоу', 'Передачи'),
        ('Видеоигры', 'Видеоигры,Передачи'),
        ('Фильмы-спектакли', 'Музыкальные,Фильмы'),
        ('Познавательные', 'Развивающие,Передачи'),
        ('Хочу всё знать', 'Развивающие,Передачи'),
        ('Фантастические', 'Фантастика'),
        ('Фэнтези', 'Фантастика'),
        ('Союзмультфильм', 'Союзмультфильм,Анимация'),
        ('Развлекательные', 'Комедии'),
        ('Комедия', 'Комедии'),
        ('Вестерн', 'Фильмы,Зарубежные,Боевик'),
        ('Советское кино', 'Советские,Фильмы'),
        ('Прочие', 'General'),
        ('Мультфильм', 'Анимация'),
        ('Музыкальный', 'Музыкальные'),
        ('Семейный', 'Семейные'),
        ('Приключение', 'Приключения'),
        ('Научная фантастика', 'Фантастика'),
        ('сказка', 'Сказки'),
        ('Триллер', 'Триллеры'),
    )

    # Items without genre information fall into a catch-all genre.
    if s is None:
        return ['General']

    text = str(s)
    for pattern, replacement in replacements:
        text = text.replace(pattern, replacement)

    return text.split(',')

# Spark UDF wrapper around replace_genres; it returns an array of strings.
replace_genres_udf = udf(f=replace_genres, returnType=ArrayType(StringType()))

In [None]:
# Turn each item's genre string into a list of normalised genre names.
items_with_genre_tokens = df_items.withColumn("genres_arr", replace_genres_udf("genres"))

# Fit the genre vocabulary on the item catalogue; the fitted model is reused
# later to encode the per-user genre lists.
count_vectorizer = CountVectorizer(binary=False, inputCol='genres_arr', outputCol="genres_vector")
count_vectorizer_model = count_vectorizer.fit(items_with_genre_tokens)

# Normalise the genre count vectors (Normalizer with its default norm).
normalizer = Normalizer(inputCol='genres_vector', outputCol="genres_norm")

df_items_genres = (
    normalizer
    .transform(count_vectorizer_model.transform(items_with_genre_tokens))
    .select('item_id', 'genres_norm')
)

df_items_genres.show(2)

# Расчет фичей жанра для клиента

In [None]:
# Pair every training interaction with the normalised genre list of its item.
df_user_genres = (
    df_user
    .join(df_items, df_user.item_id == df_items.item_id, 'inner')
    .select(
        df_user.user_id,
        replace_genres_udf(df_items.genres).alias("genres_arr"),
        df_user.purchase
    )
)

# One row per (interaction, genre).
df_user_genres = df_user_genres.select(
    df_user_genres.user_id,
    explode(df_user_genres.genres_arr).alias('genres'),
    df_user_genres.purchase
)

# Genre counts over ALL items shown to the user, encoded with the vocabulary
# fitted on the item catalogue.
df_user_genres_all = df_user_genres.groupBy('user_id').agg(collect_list('genres').alias('genres_arr'))
df_user_genres_all = count_vectorizer_model.transform(df_user_genres_all)

# Genre counts over PURCHASED items only. The left join from the distinct
# user list keeps users without purchases; they get an empty genre list.
df_user_genres_purchase = df_user_genres.filter(df_user_genres.purchase == 1)
df_user_uniq = df_user.select('user_id').distinct()
df_user_genres_purchase = (
    df_user_uniq
    .join(df_user_genres_purchase, df_user_uniq.user_id == df_user_genres_purchase.user_id, 'left')
    .select(df_user_uniq.user_id, df_user_genres_purchase.genres)
)

df_user_genres_purchase = (
    df_user_genres_purchase
    .groupBy('user_id')
    .agg(collect_list('genres').alias('genres_arr'))
)
df_user_genres_purchase = count_vectorizer_model.transform(df_user_genres_purchase)

# Put both count vectors side by side, one row per user.
df_user_genres = df_user_genres_all.join(
    df_user_genres_purchase,
    df_user_genres_all.user_id == df_user_genres_purchase.user_id,
    'inner'
).select(
    df_user_genres_all.user_id,
    df_user_genres_all.genres_vector.alias('genres_vector_all'),
    df_user_genres_purchase.genres_vector.alias('genres_vector_purchase')
)

# Normalise each of the two vectors. `normalizer` is rebound twice here and
# ends up as the normalizer of the purchase vector.
normalizer = Normalizer(inputCol='genres_vector_all', outputCol="genres_norm_all")
df_user_genres = normalizer.transform(df_user_genres)

normalizer = Normalizer(inputCol='genres_vector_purchase', outputCol="genres_norm_purchase")
df_user_genres = normalizer.transform(df_user_genres)

# Keep only the final feature columns and cache them. The result must be
# assigned back: DataFrame transformations return a new DataFrame, so an
# unassigned select(...).cache() neither trims nor caches df_user_genres.
df_user_genres = (
    df_user_genres
    .select('user_id', 'genres_norm_all', 'genres_norm_purchase')
    .coalesce(10)
    .cache()
)

df_user_genres.show(2)

# Собираем набор для обучения

In [None]:
# Base of the training set: the train pairs with the label renamed to 'target'.
df_train = df_user.select("user_id", "item_id", col("purchase").alias('target'))
# df_train.show(5)

In [None]:
# Add the genre features: the user's two genre profiles (all shown items /
# purchased items) and the item's genre vector. Inner joins drop pairs whose
# user or item has no genre features.
df_train = (
    df_train
    .join(df_user_genres, on='user_id', how='inner')
    .join(df_items_genres, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'target',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
    )
    .coalesce(10)
    .cache()
)

# df_train.show(2)

In [None]:
# Add the purchase-rate features of the user and of the item.
df_train = (
    df_train
    .join(df_user_user_stat, on='user_id', how='inner')
    .join(df_user_item_stat, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'target',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
    )
    .coalesce(10)
    .cache()
)

# df_train.filter((df_train.user_id == 816426) & (df_train.item_id == 91200)).show()
# df_train.show(2)

In [None]:
# Add the release-year features. Both sources call their vector
# 'year_cat_norm', so each is renamed before joining to keep the two apart.
user_year_features = df_user_year.withColumnRenamed('year_cat_norm', 'user_year_norm')
item_year_features = df_items_year.withColumnRenamed('year_cat_norm', 'item_year_norm')

df_train = (
    df_train
    .join(user_year_features, on='user_id', how='inner')
    .join(item_year_features, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'target',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
        'user_year_norm',
        'item_year_norm',
    )
    .coalesce(10)
    .cache()
)

# df_train.show(2)

In [None]:
# Add the viewing-duration features. Both sources call their column
# 'duration', so each is renamed before joining to keep the two apart.
user_duration_features = df_user_duration.withColumnRenamed('duration', 'user_duration')
item_duration_features = df_item_duration.withColumnRenamed('duration', 'item_duration')

df_train = (
    df_train
    .join(user_duration_features, on='user_id', how='inner')
    .join(item_duration_features, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'target',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
        'user_year_norm',
        'item_year_norm',
        'user_duration',
        'item_duration',
    )
    .coalesce(10)
    .cache()
)

# df_train.show(2)

In [None]:
# Concatenate all feature columns, in this order, into one 'features' vector.
feature_columns = [
    "genres_norm_all",
    "genres_norm_purchase",
    "genres_norm",
    "user_purchase_rate",
    "item_purchase_rate",
    "user_year_norm",
    "item_year_norm",
    "user_duration",
    "item_duration",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# From here on the training set holds only the feature vector and the label.
df_train = assembler.transform(df_train).select("features", "target")
df_train.show(2)

In [None]:
# Balance the classes by down-sampling the negatives to roughly the number of
# positives.
positives = df_train.filter(df_train.target == 1)
negatives = df_train.filter(df_train.target == 0)

positive_count = positives.count()
negative_count = negatives.count()

# Share of negatives to keep. It is clamped to 1.0 because a sampling
# fraction without replacement must lie in [0, 1], and it falls back to 1.0
# when there are no negatives at all (avoids ZeroDivisionError).
samle_count = min(1.0, positive_count / negative_count) if negative_count else 1.0

# A fixed seed makes the sampled training set reproducible across runs.
df_train = (
    positives
    .union(negatives.sample(withReplacement=False, fraction=samle_count, seed=42))
    .coalesce(10)
)

# Class sizes after balancing: negatives first, then positives.
print(df_train.filter(df_train.target == 0).count())
print(df_train.filter(df_train.target == 1).count())

# Подбор гиперпараметров

In [None]:
# Random forest with fixed depth and bin count; only the number of trees is
# searched over in the grid below.
rf = RandomForestClassifier(featuresCol='features', labelCol="target", maxDepth=30, maxBins=16)

gbparamGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [50, 100, 150])
    .build()
)

# Scores each candidate on the raw prediction column against 'target'.
gbevaluator = BinaryClassificationEvaluator(labelCol="target", rawPredictionCol="rawPrediction")

# 5-fold cross-validation over the grid; parallelism is left at its default.
gbcv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=gbparamGrid,
    evaluator=gbevaluator,
    numFolds=5,
)

In [None]:
# Run the cross-validated grid search on the training set and print the
# resulting CrossValidatorModel.
gbcvModel = gbcv.fit(df_train)
print(gbcvModel)

In [None]:
# Show the parallelism setting the cross-validator is configured with.
gbcv.getParallelism()

In [None]:
# Show the full parameter map of the best model found by cross-validation.
gbcvModel.bestModel.extractParamMap()

# Обучаем модель

In [None]:
# Logistic regression on the assembled features, at most 30 iterations.
lr = LogisticRegression(labelCol="target", featuresCol='features', maxIter=30)
lr_model = lr.fit(df_train)

In [None]:
# Gradient-boosted trees on the assembled features, 15 boosting iterations.
gbtc = GBTClassifier(featuresCol='features', labelCol="target", maxIter=15)
gbtc_model = gbtc.fit(df_train)

In [None]:
# Random forest used for the final predictions. This rebinds `rf`, which
# previously named the estimator inside the cross-validator.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol="target",
    numTrees=200,
    maxDepth=20,
    maxBins=128,
)
rf_model = rf.fit(df_train)

In [None]:
# Size of the input layer = length of the assembled feature vector, read
# from the first row of the training set.
features_len = len(df_train.select('features').head(1)[0][0])

# Multilayer perceptron: input layer, four hidden layers, two output classes.
layer_sizes = [features_len, 200, 50, 100, 4, 2]
mp = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol="target",
    maxIter=300,
    layers=layer_sizes,
)
mp_model = mp.fit(df_train)

# Собираем набор для тестирования

In [None]:
# Base of the scoring set: the raw test pairs (same object as df_user_test;
# the cells below rebind df_test to enriched copies).
df_test = df_user_test

In [None]:
# Add the genre features: the user's two genre profiles (all shown items /
# purchased items) and the item's genre vector. Inner joins drop test pairs
# whose user or item has no genre features.
df_test = (
    df_test
    .join(df_user_genres, on='user_id', how='inner')
    .join(df_items_genres, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
    )
    .coalesce(10)
    .cache()
)

# df_test.show(5)

In [None]:
# Add the purchase-rate features of the user and of the item. Both rates are
# computed from the training pairs.
df_test = (
    df_test
    .join(df_user_user_stat, on='user_id', how='inner')
    .join(df_user_item_stat, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
    )
    .coalesce(10)
    .cache()
)

# df_test.show(5)

In [None]:
# Add the release-year features. Both sources call their vector
# 'year_cat_norm', so each is renamed before joining to keep the two apart.
user_year_features = df_user_year.withColumnRenamed('year_cat_norm', 'user_year_norm')
item_year_features = df_items_year.withColumnRenamed('year_cat_norm', 'item_year_norm')

df_test = (
    df_test
    .join(user_year_features, on='user_id', how='inner')
    .join(item_year_features, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
        'user_year_norm',
        'item_year_norm',
    )
    .coalesce(10)
    .cache()
)

# df_test.show(5)

In [None]:
# Add the viewing-duration features. Both sources call their column
# 'duration', so each is renamed before joining to keep the two apart.
user_duration_features = df_user_duration.withColumnRenamed('duration', 'user_duration')
item_duration_features = df_item_duration.withColumnRenamed('duration', 'item_duration')

df_test = (
    df_test
    .join(user_duration_features, on='user_id', how='inner')
    .join(item_duration_features, on='item_id', how='inner')
    .select(
        'user_id',
        'item_id',
        'genres_norm_all',
        'genres_norm_purchase',
        'genres_norm',
        'user_purchase_rate',
        'item_purchase_rate',
        'user_year_norm',
        'item_year_norm',
        'user_duration',
        'item_duration',
    )
    .coalesce(10)
    .cache()
)

# df_train.show(2)

In [None]:
# Concatenate the feature columns into one 'features' vector, in the same
# order as for the training set.
feature_columns = [
    "genres_norm_all",
    "genres_norm_purchase",
    "genres_norm",
    "user_purchase_rate",
    "item_purchase_rate",
    "user_year_norm",
    "item_year_norm",
    "user_duration",
    "item_duration",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep the pair identifiers next to the feature vector for the final export.
df_test = assembler.transform(df_test).select("user_id", "item_id", "features")
df_test.show(2)

In [None]:
# Score the test set with one of the fitted models. The random forest is the
# active choice; the commented lines are the alternatives fitted above.
# predictions = lr_model.transform(df_test)
# predictions = gbtc_model.transform(df_test)
# predictions = gbcvModel.transform(df_test)
predictions = rf_model.transform(df_test)
# predictions = mp_model.transform(df_test)

In [None]:
# print(df_user_test.count())
# print(predictions.count())
# predictions.show(5)

In [None]:
# Build the submission from the FULL test set rather than from `predictions`
# alone. The feature cells attach every feature with an inner join, so a test
# pair whose user or item lacks some feature is absent from `predictions`.
# Left-joining the predictions back onto df_user_test keeps such pairs, and
# they receive purchase = 0.0.

# One predicted label per (user_id, item_id); de-duplicated so that repeated
# test pairs are not multiplied by the join below.
scored_pairs = (
    predictions
    .select("user_id", "item_id", col("prediction").alias("purchase"))
    .dropDuplicates(["user_id", "item_id"])
)

submission = (
    df_user_test
    .join(scored_pairs, on=["user_id", "item_id"], how="left")
    .fillna(value=0.0, subset=["purchase"])
    .sort("user_id", "item_id")
)

# Collect to the driver and write the CSV. The pandas index is written as the
# first, unnamed column.
submission.toPandas().to_csv('lab03.csv')

In [None]:
# Shut down the Spark session; no Spark cell can run after this one.
spark.stop()