In [1]:
import os
import sys
# Point PySpark at the cluster's Spark client and the Python interpreter to use.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend both entries in one step: the py4j archive first, then Spark's
# python directory, so they are searched before anything already on sys.path.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf, SQLContext
from pyspark.sql import SparkSession

# Application-level Spark settings for this notebook.
conf = SparkConf()
conf.set("spark.app.name", "Denis.Ivashchenko_lab04")
conf.set('spark.executor.instances', '3')

spark = SparkSession\
        .builder\
        .config(conf=conf)\
        .getOrCreate()

# SQLContext's first argument is a SparkContext, not a SparkSession; hand it
# the session's context and bind it explicitly to the session created above.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [3]:
! hdfs dfs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2021-02-27 22:13 /labs/slaba04/gender_age_dataset.txt


## Train stage

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, DoubleType, FloatType, BinaryType, TimestampType
import pyspark.sql.functions as f

In [5]:
# Every column of the tab-separated dataset is read as a nullable string.
schema = StructType([
    StructField(column, StringType(), True)
    for column in ("gender", "age", "uid", "user_json")
])

In [6]:
# Tab-separated file with a header row, parsed with the explicit schema.
df = spark.read.csv(
    "/labs/slaba04/gender_age_dataset.txt",
    header=True,
    sep='\t',
    schema=schema,
)

## Pipeline

In [7]:
from pyspark.ml.feature import CountVectorizer,VectorAssembler,VectorIndexer,StringIndexer,StringIndexerModel,HashingTF,PCA
from pyspark.ml.linalg import Vectors, DenseVector

In [8]:
from pyspark.ml import Transformer
from pyspark.ml import Estimator
from pyspark.ml import Model,Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import Evaluator, MulticlassClassificationEvaluator

In [9]:
class DagTransformer(Transformer):
    """Label-encode the combined (age, gender) target.

    Drops rows whose ``age`` is ``'-'`` and appends an integer column ``ag``
    holding the index of the row's (age, gender) pair. The pair -> index
    table built by the most recent ``transform`` call is kept in ``self.dag``.
    """

    def __init__(self):
        # Run the base-class initialiser so the transformer gets its own uid
        # and param maps instead of a hard-coded uid shared by every instance.
        super(DagTransformer, self).__init__()
        self.dag = {}

    def _transform(self, dataset):
        """Return ``dataset`` without unknown ages, plus the ``ag`` label column.

        The lookup table is rebuilt from the ages present in ``dataset`` on
        every call. Rows whose (age, gender) pair is not in the table make the
        UDF raise ``KeyError``.
        """
        known = dataset.filter(f.col('age') != '-')
        age_rows = known.select('age').distinct().collect()

        # One label per (age, gender) pair. enumerate() sizes the table to the
        # data instead of silently truncating it at ten entries.
        # NOTE(review): indices follow the order in which Spark returns the
        # distinct ages; labels from separate transform calls only line up if
        # that order is the same -- confirm, or derive the table once and reuse it.
        pairs = [(row.age, gender) for row in age_rows for gender in ['F', 'M']]
        mapping = {pair: index for index, pair in enumerate(pairs)}
        self.dag = mapping

        # Close over the plain dict rather than ``self`` so only the lookup
        # table is shipped to the executors.
        encode = f.udf(lambda age, gender: mapping[(age, gender)], IntegerType())
        return known.withColumn('ag', encode(f.col('age'), f.col('gender')))


In [10]:
# Rows with a known age, and the distinct age groups found among them.
dataset = df.filter(f.col('age') != '-')
lages = df.select('age').filter(f.col('age') != '-').distinct().collect()

# (age, gender) -> class index, in the order the age groups were collected.
# enumerate() sizes the table to the data instead of stopping at ten entries.
dag = {
    (row.age, gender): index
    for index, (row, gender) in enumerate(
        (age_row, g) for age_row in lages for g in ['F', 'M']
    )
}

In [11]:
class VisitTransformer(Transformer):
    """Turn the raw visit log into an array column of visited hosts.

    Reads ``user_json`` and adds a ``visits`` column: the URLs found in the
    log, reduced to their network location and de-duplicated.
    """

    def _transform(self, dataset):
        # Step 1: pull every URL out of the raw log string.
        extracted = sub_url(f.col('user_json'))
        with_urls = dataset.withColumn('visits', extracted)

        # Step 2: keep only the host part of each URL, then drop repeats.
        unique_hosts = vis_uni(vis_net(f.col('visits')))
        return with_urls.withColumn('visits', unique_hosts)

In [12]:
import re
import urllib
# Matches the text that follows `url": "` up to (not including) the next
# double quote, i.e. each URL value embedded in the raw log string.
regexp = re.compile(r'(?<=url": ")[^"]+')

@f.udf(ArrayType(StringType()))
def sub_url(a):
    """Extract every URL matched by ``regexp`` from the raw log string ``a``.

    Returns a list of strings. A null input gives an empty list, so a single
    row without a log does not fail the whole job.
    """
    if a is None:
        return []
    return regexp.findall(a)

@f.udf(ArrayType(StringType()))
def vis_net(a):
    """Map each URL in the array ``a`` to its network location (host[:port]).

    Returns a list of the same length as ``a``. A null array gives an empty
    list. A URL that ``urlparse`` rejects contributes an empty string, the
    same value produced for URLs that have no network location.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` is loaded in the worker process.
    from urllib.parse import urlparse

    if a is None:
        return []

    hosts = []
    for url in a:
        try:
            hosts.append(urlparse(url).netloc)
        except ValueError:
            # urlparse raises ValueError on a malformed netloc (for example
            # unmatched square brackets); one bad URL must not kill the job.
            hosts.append('')
    return hosts


@f.udf(ArrayType(StringType()))
def vis_uni(a):
    """Return the distinct items of ``a``, keeping first-seen order."""
    # Dict keys are unique and keep insertion order, which de-duplicates the list.
    return list(dict.fromkeys(a))

In [13]:
# 1:4 random split with a fixed seed; the smaller part is held out for testing.
dfs = df.randomSplit([1.0, 4.0], seed=123)
df_test, df_train = dfs

In [14]:
# Label encoding, then URL extraction, on the training split.
df = VisitTransformer().transform(DagTransformer().transform(df_train))

# Bag-of-hosts counts over the 'visits' arrays.
cv = CountVectorizer(inputCol='visits', outputCol='features', minTF=1, minDF=20, vocabSize=5000)
cv_model = cv.fit(df)
df = cv_model.transform(df)

# Project the count vectors onto 15 principal components.
pca = PCA(inputCol='features', outputCol='pca_features', k=15)
pca_model = pca.fit(df)
df = pca_model.transform(df)

In [15]:
# Decision tree over the PCA features, predicting the encoded (age, gender) label.
dt = DecisionTreeClassifier(labelCol='ag', featuresCol='pca_features')
dt_model = dt.fit(df)

In [None]:
#Alternative Model
#lr = LogisticRegression(labelCol='ag',featuresCol='pca_features', family='multinomial', maxIter=15, regParam=0.005, elasticNetParam=0.05)
#lr_model = lr.fit(df)

In [16]:
# Accuracy of 'prediction' against the encoded label column 'ag'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='ag',
    predictionCol='prediction',
    metricName='accuracy',
)

In [17]:

# Same preparation as for training: label encoding, then host extraction.
df_test = VisitTransformer().transform(DagTransformer().transform(df_test))
# Apply the fitted stages in order: count vectors -> PCA -> decision tree.
df_test = dt_model.transform(pca_model.transform(cv_model.transform(df_test)))


In [18]:
# Metric computed by `evaluator` on the predictions stored in df_test.
evaluator.evaluate(df_test)

0.26106934001670845

In [19]:
# Schema of a Kafka record with key and value as strings; all fields nullable.
# NOTE(review): the stream projection in this notebook names the last column
# `timestampType`, while this schema calls it `timetype` -- confirm which is intended.
schema_test = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("key", StringType()),
        ("value", StringType()),
        ("topic", StringType()),
        ("partition", IntegerType()),
        ("offset", IntegerType()),
        ("timestamp", TimestampType()),
        ("timetype", IntegerType()),
    ]
])

In [20]:
# Kafka source options. Alternatives that were tried and left disabled:
#   "startingOffsets": """{"input_denis.ivashchenko":{"0": 50000}}"""
#   "checkpointLocation": "streaming/chk/chk_kafka"
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_denis.ivashchenko",
    "failOnDataLoss": 'False',
    "startingOffsets": "latest",
}

inputDF = spark.readStream.format('kafka').options(**read_kafka_params).load()

# Key and value are cast to strings; the remaining Kafka columns pass through.
inp = inputDF.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
    "topic",
    "partition",
    "offset",
    "timestamp",
    "timestampType",
)

In [21]:
# Layout of the JSON payload carried in the Kafka message value.
json_struct = (
    StructType()
    .add('uid', StringType())
    .add('visits', StringType())
)

In [22]:
def proc_batch(df, epoch_id):
    """Score one micro-batch of Kafka messages and publish the predictions.

    Parses the JSON in the ``value`` column, runs the fitted feature and model
    stages, decodes the predicted class index back to gender and age through
    ``dag``, and writes one JSON message per row to the Kafka sink described
    by ``write_kafka_params``.

    Args:
        df: micro-batch DataFrame whose ``value`` column is a JSON string laid
            out as ``json_struct`` (fields ``uid`` and ``visits``).
        epoch_id: second argument passed to the batch callback; unused.
    """
    # The option name has no hyphen; the misspelled key was silently ignored.
    json_options = {
        'multiLine': 'True',
        'allowBackslashEscapingAnyCharacter': 'True',
    }
    parsed = df.withColumn('value', f.from_json(f.col('value'), json_struct, json_options))

    # Read the struct fields directly instead of through row-wise Python UDFs.
    sdf = parsed.select(
        f.col('value.uid').alias('uids'),
        f.col('value.visits').alias('user_json'),
    )

    sdf = VisitTransformer().transform(sdf)
    sdf = cv_model.transform(sdf)
    sdf = pca_model.transform(sdf)
    sdf = dt_model.transform(sdf)

    # Build the class index -> (age, gender) table once per batch, not once per row.
    index_to_pair = {index: pair for pair, index in dag.items()}
    decode_gender = f.udf(lambda index: index_to_pair[index][1])
    decode_age = f.udf(lambda index: index_to_pair[index][0])
    predicted = f.col('prediction').cast('int')

    result = (
        sdf
        .withColumn('gender', decode_gender(predicted))
        .withColumn('age', decode_age(predicted))
        .select(f.col('uids').alias('uid'), 'gender', 'age')
        .selectExpr("to_json(struct(*)) AS value")
    )
    result.write.format('kafka').options(**write_kafka_params).save()


In [None]:
#sdf = spark.read.csv(path="./lab04/part-00000-590f11f5-c6ee-466f-b629-0ddc62b78359-c000.csv", header=True)

In [23]:
# Kafka sink options used by proc_batch when it writes each scored batch.
write_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "topic": "denis.ivashchenko",
}

# Start the streaming query: every micro-batch of `inp` goes to proc_batch.
output_f = (
    inp.writeStream
    .outputMode("append")
    .option("checkpointLocation", "streaming/chk/chk_kafka")
    .foreachBatch(proc_batch)
    .start()
)

In [30]:
# Current status of the streaming query (message, data availability, trigger state).
output_f.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [25]:
# Most recent progress report of the streaming query.
output_f.lastProgress

In [29]:
# Stop the streaming query.
output_f.stop()

In [31]:
# Shut down the Spark session.
spark.stop()