In [2]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container uses 95% of the browser width.
display(HTML(data="<style>.container { width:95% !important; }</style>"))
import os
import sys
# Point PySpark at the interpreter and Spark installation, and set the
# resources requested when the pyspark shell is launched.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 10 --executor-memory 5g --executor-cores 8 --driver-memory 8g pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources to the import path; the py4j archive
# is inserted last, so it ends up first on sys.path.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
from pyspark.ml.feature import *
from pyspark.sql.window import Window
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.stat import Correlation
from pyspark.ml.recommendation import ALS
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import time
import pandas as pd
from datetime import datetime

# Create (or reuse) the session; the Kafka SQL connector package is requested
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("spark-course")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
    .getOrCreate()
)

spark

# Data transformation

In [6]:
# Peek at the header row and the first record of the raw dataset.
# NOTE(review): the "Unable to write to output stream" message in the output is
# presumably hdfs reacting to `head -2` closing the pipe early — confirm it is harmless.
! hdfs dfs -cat /labs/slaba04/gender_age_dataset.txt | head -2

gender	age	uid	user_json
F	18-24	d50192e5-c44e-4ae8-ae7a-7cfe67c8b777	{"visits": [{"url": "http://zebra-zoya.ru/200028-chehol-organayzer-dlja-macbook-11-grid-it.html?utm_campaign=397720794&utm_content=397729344&utm_medium=cpc&utm_source=begun", "timestamp": 1419688144068}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426666298001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426666298000}, {"url": "http://news.yandex.ru/yandsearch?cl4url=chezasite.com/htc/htc-one-m9-delay-86327.html&lr=213&rpt=story", "timestamp": 1426661722001}, {"url": "http://www.sotovik.ru/news/240283-htc-one-m9-zaderzhivaetsja.html", "timestamp": 1426661722000}]}
cat: Unable to write to output stream.


In [7]:
# List the contents of the /user/roman.razumovskiy directory on HDFS.
! hdfs dfs -ls /user/roman.razumovskiy

Found 10 items
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-04 22:18 /user/roman.razumovskiy/.sparkStaging
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-03 01:49 /user/roman.razumovskiy/age_model
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-04 18:36 /user/roman.razumovskiy/cp_s_example.parquet
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-10-24 06:46 /user/roman.razumovskiy/gbt_40
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-03 01:48 /user/roman.razumovskiy/gender_model
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-03 17:54 /user/roman.razumovskiy/kafka_df
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-03 17:54 /user/roman.razumovskiy/kafka_df_cp
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy          0 2022-11-03 22:47 /user/roman.razumovskiy/lab05.csv
drwxr-xr-x   - roman.razumovskiy roman.razumovskiy      

In [8]:
# Read the tab-separated dataset (first line is the header) and keep only the
# rows whose gender is 'F' or 'M'.
gender_age_dataset = (
    spark.read
    .csv('/labs/slaba04/gender_age_dataset.txt', sep='\t', header=True)
    .filter(F.col('gender').isin('F', 'M'))
)

In [10]:
def get_domain(s):
    """Extract the host part of a URL using '/'-splitting.

    The URL is split on '/' and the third segment is returned (for
    'http://host/path' that is 'host'). When that segment is itself a scheme
    marker ('http:' or 'https:'), the fifth segment is returned instead —
    presumably to handle a URL nested behind a prefix; TODO confirm.

    Args:
        s: URL string; may be None or malformed.

    Returns:
        The extracted segment, or '' when `s` is not a string or does not
        contain enough '/'-separated segments.
    """
    try:
        # Split once and reuse the segments for both lookups.
        segments = s.split('/')
        domain = segments[2]
        if domain in ('http:', 'https:'):
            domain = segments[4]
        return domain
    except (AttributeError, IndexError, TypeError):
        # AttributeError/TypeError: `s` is None or not a str.
        # IndexError: too few segments. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate instead of being swallowed.
        return ''

# Wrap get_domain as a Spark UDF that returns a string column.
get_domain_udf = F.udf(get_domain, returnType=StringType())

In [11]:
# Two-stage JSON parse: the outer object maps a key to an array of raw strings,
# and each of those strings is parsed again as a flat string-to-string map.
json_type_1 = MapType(StringType(), ArrayType(StringType()))
json_type_2 = MapType(StringType(), StringType())

# One row per visit: explode the 'visits' array, pull out url/timestamp,
# replace the url with the value returned by get_domain_udf, then cache.
clickstream = (
    gender_age_dataset
    .select(
        'gender',
        'age',
        'uid',
        F.explode(F.from_json('user_json', json_type_1).getItem('visits')).alias('visit'),
    )
    .withColumn('url', F.from_json('visit', json_type_2).getItem('url'))
    .withColumn('timestamp', F.from_json('visit', json_type_2).getItem('timestamp').cast('double'))
    .withColumn('url', get_domain_udf('url'))
    .drop('visit')
    .cache()
)

clickstream.printSchema()
for row in clickstream.limit(5).collect():
    print(row)

root
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- url: string (nullable = true)
 |-- timestamp: double (nullable = true)

Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', url='zebra-zoya.ru', timestamp=1419688144068.0)
Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', url='news.yandex.ru', timestamp=1426666298001.0)
Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', url='www.sotovik.ru', timestamp=1426666298000.0)
Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', url='news.yandex.ru', timestamp=1426661722001.0)
Row(gender='F', age='18-24', uid='d50192e5-c44e-4ae8-ae7a-7cfe67c8b777', url='www.sotovik.ru', timestamp=1426661722000.0)


In [54]:
# Collapse the clickstream to one row per (gender, age, uid) holding the list
# of visited url values.
features_df = (
    clickstream
    .groupBy('gender', 'age', 'uid')
    .agg(F.collect_list('url').alias('url_list'))
)

# Pipeline stages: hash the url list into a 1000-dimensional term-frequency
# vector and index both label columns in ascending alphabetical order.
htf = HashingTF(inputCol='url_list', outputCol='features', numFeatures=1000)
gender_indexer = StringIndexer(
    inputCol='gender',
    outputCol='gender_target',
    stringOrderType='alphabetAsc',
)
age_indexer = StringIndexer(
    inputCol='age',
    outputCol='age_target',
    stringOrderType='alphabetAsc',
)
pipeline = Pipeline(stages=[htf, gender_indexer, age_indexer])

model = pipeline.fit(features_df)
features_df = (
    model
    .transform(features_df)
    .drop('url_list')
    .cache()
)

features_df.show(5)

# Show how each original label maps to its numeric target, with class counts.
(features_df
    .groupBy('gender', 'gender_target')
    .count()
    .orderBy('gender_target')
    .show(10))
(features_df
    .groupBy('age', 'age_target')
    .count()
    .orderBy('age_target')
    .show(10))

+------+-----+--------------------+--------------------+-------------+----------+
|gender|  age|                 uid|            features|gender_target|age_target|
+------+-----+--------------------+--------------------+-------------+----------+
|     F|18-24|09b1ecd3-b2d2-4c1...|  (1000,[509],[3.0])|          0.0|       0.0|
|     F|18-24|15faf063-5e44-4b6...|(1000,[43,268,293...|          0.0|       0.0|
|     F|18-24|560142d9-6c9c-439...|(1000,[63,200,218...|          0.0|       0.0|
|     F|18-24|6709f443-7ddd-423...|  (1000,[696],[5.0])|          0.0|       0.0|
|     F|18-24|67e9bd68-ef03-49c...|(1000,[402,706,77...|          0.0|       0.0|
+------+-----+--------------------+--------------------+-------------+----------+
only showing top 5 rows

+------+-------------+-----+
|gender|gender_target|count|
+------+-------------+-----+
|     F|          0.0|17440|
|     M|          1.0|18698|
+------+-------------+-----+

+-----+----------+-----+
|  age|age_target|count|
+-----+-----

# Gender model

In [55]:
# Sampling fraction of 0.8 for each of the two gender_target classes (0 and 1).
gender_fractions_dict = dict.fromkeys(range(2), 0.8)

In [56]:
# Stratified sample by gender_target for training; every uid that was not
# sampled goes to the validation set (left anti join on uid).
gender_train_df = (
    features_df
    .sampleBy('gender_target', fractions=gender_fractions_dict, seed=5757)
    .coalesce(10)
    .cache()
)
gender_valid_df = (
    features_df
    .join(gender_train_df, on='uid', how='leftanti')
    .coalesce(10)
    .cache()
)

In [57]:
# Fit a gradient-boosted tree classifier on the gender label and print the
# wall-clock fitting time in minutes.
start_time = time.time()
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='gender_target',
    maxIter=40,
    maxDepth=5,
    seed=42,
)
gender_model = gbt.fit(gender_train_df)
print(f'{(time.time() - start_time)/60} minutes')

0.8884486198425293 minutes


In [58]:
# Accuracy of the gender model on the training and validation sets,
# displayed as a (train, valid) tuple.
evaluator = MulticlassClassificationEvaluator(
    labelCol='gender_target',
    predictionCol='prediction',
    metricName='accuracy',
)
train_predictions = gender_model.transform(gender_train_df)
valid_predictions = gender_model.transform(gender_valid_df)
(
    evaluator.evaluate(train_predictions),
    evaluator.evaluate(valid_predictions),
)

(0.6568854213415266, 0.6121879793613164)

In [59]:
# Persist the fitted gender model, replacing any earlier save at the
# relative path 'gender_model'.
(gender_model
    .write()
    .overwrite()
    .save('gender_model'))

# Age model

In [60]:
# Sampling fraction of 0.8 for each of the five age_target classes (0 through 4).
age_fractions_dict = dict.fromkeys(range(5), 0.8)

In [61]:
# Stratified sample by age_target for training; every uid that was not
# sampled goes to the validation set (left anti join on uid).
age_train_df = (
    features_df
    .sampleBy('age_target', fractions=age_fractions_dict, seed=5757)
    .coalesce(10)
    .cache()
)
age_valid_df = (
    features_df
    .join(age_train_df, on='uid', how='leftanti')
    .coalesce(10)
    .cache()
)

In [62]:
# Fit a logistic regression on the age label (regParam 0.1 with
# elasticNetParam 0) and print the wall-clock fitting time in minutes.
start_time = time.time()
lr = LogisticRegression(
    featuresCol='features',
    labelCol='age_target',
    maxIter=50,
    regParam=0.1,
    elasticNetParam=0,
)
age_model = lr.fit(age_train_df)
print(f'{(time.time() - start_time)/60} minutes')

0.0368956446647644 minutes


In [63]:
# Accuracy of the age model on the training and validation sets,
# displayed as a (train, valid) tuple.
evaluator = MulticlassClassificationEvaluator(
    labelCol='age_target',
    predictionCol='prediction',
    metricName='accuracy',
)
train_predictions = age_model.transform(age_train_df)
valid_predictions = age_model.transform(age_valid_df)
(
    evaluator.evaluate(train_predictions),
    evaluator.evaluate(valid_predictions),
)

(0.4604895225601547, 0.42560312369265096)

In [64]:
# Persist the fitted age model, replacing any earlier save at the
# relative path 'age_model'.
(age_model
    .write()
    .overwrite()
    .save('age_model'))