In [None]:
import os
import sys
# Environment variables for the PySpark session; they are set before shell.py is executed below.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the PySpark sources and the py4j archive shipped under SPARK_HOME on the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell.py in the notebook namespace. The script is read
# through a context manager so the file handle is closed deterministically.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

In [None]:
# Display the `spark` object. It is not assigned anywhere in this notebook, so it
# presumably comes from the shell.py script executed in the first cell — confirm.
spark

### В этом задании нужно создать кастомный RegexTransformer. Это трансформер, у которого есть входной и выходной столбцы, а также параметр `regex`, который определяет, какому шаблону должны удовлетворять слова во входном столбце, чтобы попасть в выходной. В итоге в выходном столбце должен оказаться массив (`Array`) подходящих слов.

[Transformer base class](https://github.com/apache/spark/blob/v2.4.0/python/pyspark/ml/base.py#L139)

### Создайте датафрейм и протестируйте ваш трансформер с несколькими регулярными выражениями

In [None]:
# Ваш код здесь

In [None]:
import re

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark import keyword_only

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType

In [None]:
import pandas as pd

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# One-column schema: a single string field named "text".
schema = StructType([StructField("text", StringType())])

# Sample DataFrame with one row, used to try out the transformer.
df = spark.createDataFrame(data=[["Hello, world"]], schema=schema)

In [None]:
# Print the contents of the sample DataFrame.
df.show()

In [None]:
class RegexTransformer(Transformer, HasInputCol, HasOutputCol):
    """First draft of the transformer: a plain pass-through.

    It only wires up the ``inputCol`` / ``outputCol`` params; ``_transform``
    copies the input column into the output column without modifying it.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Store the column names that were actually supplied.

        Args:
            inputCol: name of the column to read; left unset when None.
            outputCol: name of the column to write; left unset when None.
        """
        super(RegexTransformer, self).__init__()
        column_setters = (
            (self.setInputCol, inputCol),
            (self.setOutputCol, outputCol),
        )
        for apply_setter, column_name in column_setters:
            if column_name is not None:
                apply_setter(column_name)

    def _transform(self, dataset):
        """Return ``dataset`` with the output column holding a copy of the input column."""
        target_name = self.getOutputCol()
        source_column = F.col(self.getInputCol())
        return dataset.withColumn(target_name, source_column)

In [None]:
# Instantiate the transformer with "text" as the input column and "tokens" as the output column.
tokenizer = RegexTransformer(inputCol="text", outputCol="tokens")

In [None]:
# Apply the transformer to the sample DataFrame and print the result.
tokenizer.transform(df).show()

In [None]:
class RegexTransformer(Transformer, HasInputCol, HasOutputCol):
    """Second draft of the transformer: adds a ``regex`` param.

    The pattern is stored as a param but is not applied yet; ``_transform``
    still copies the input column into the output column unchanged.
    """

    regex = Param(Params._dummy(), "regex",
                  "Python regular expression to match tokens",
                  typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, regex=r"\w+"):
        """Store the supplied column names and the regex pattern.

        Args:
            inputCol: name of the column to read; left unset when None.
            outputCol: name of the column to write; left unset when None.
            regex: pattern stored in the ``regex`` param.
        """
        super(RegexTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)
        self._set(regex=regex)

    def setRegex(self, regex):
        """Set the ``regex`` param and return ``self`` so calls can be chained.

        The setter previously took no argument, so its body referenced an
        undefined name and raised NameError on every call.
        """
        return self._set(regex=regex)

    def getRegex(self):
        """Return the current value of the ``regex`` param."""
        return self.getOrDefault(self.regex)

    def _transform(self, dataset):
        """Return ``dataset`` with the output column holding a copy of the input column."""
        return dataset.withColumn(self.getOutputCol(), F.col(self.getInputCol()))

In [None]:
# Raw string: "\w" is not a valid escape sequence in a regular string literal.
tokenizer = RegexTransformer(inputCol="text", outputCol="tokens", regex=r"\w+")

In [None]:
# Apply the transformer to the sample DataFrame and print the result.
tokenizer.transform(df).show()

In [None]:
from functools import partial

def tokenize(series, regex):
    """Collect every match of ``regex`` in each element of ``series``.

    Args:
        series: pandas Series of strings.
        regex: regular expression handed to ``Series.str.findall``.

    Returns:
        The Series produced by ``series.str.findall(regex)``: one list of
        matches per input element.
    """
    string_accessor = series.str
    return string_accessor.findall(regex)

# Bind the pattern so that the resulting callable takes only the Series.
# Raw string: "\w" is not a valid escape sequence in a regular string literal.
f = partial(tokenize, regex=r"\w+")

In [None]:
# Wrap the pattern-bound tokenizer as a pandas UDF whose declared return type is array<string>.
# Raw string: "\w" is not a valid escape sequence in a regular string literal.
tokenize_udf = F.pandas_udf(partial(tokenize, regex=r"\w+"), returnType=ArrayType(StringType()))

# Apply the UDF to the "text" column and print the result.
df.select(tokenize_udf("text")).show()

In [None]:
class RegexTransformer(Transformer, HasInputCol, HasOutputCol):
    """Transformer that extracts all matches of a regular expression.

    For every row, the matches of the ``regex`` param found in the input
    column are written to the output column, which is declared as an array of
    strings. Matching is delegated to the notebook-level ``tokenize`` helper,
    wrapped in a pandas UDF.
    """

    regex = Param(Params._dummy(), "regex",
                  "Python regular expression to match tokens",
                  typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, regex=r"\w+"):
        """Store the supplied column names and the regex pattern.

        Args:
            inputCol: name of the column to read; left unset when None.
            outputCol: name of the column to write; left unset when None.
            regex: pattern whose matches are collected.
        """
        super(RegexTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)
        self._set(regex=regex)

    def setRegex(self, regex):
        """Set the ``regex`` param and return ``self`` so calls can be chained.

        The setter previously took no argument, so its body referenced an
        undefined name and raised NameError on every call.
        """
        return self._set(regex=regex)

    def getRegex(self):
        """Return the current value of the ``regex`` param."""
        return self.getOrDefault(self.regex)

    def _transform(self, dataset):
        """Return ``dataset`` with the output column holding the regex matches.

        Raises:
            re.error: if the stored pattern is not a valid regular expression.
        """
        pattern = self.getRegex()
        # Compile up front so an invalid pattern raises here, when transform is
        # called, rather than later when the UDF is evaluated.
        re.compile(pattern)
        # Forward the pattern string (the documented argument type of
        # Series.str.findall) instead of the compiled object.
        tokenize_udf = F.pandas_udf(partial(tokenize, regex=pattern),
                                    returnType=ArrayType(StringType()))
        return dataset.withColumn(self.getOutputCol(), tokenize_udf(self.getInputCol()))

In [None]:
# Raw string: "\w" is not a valid escape sequence in a regular string literal.
tokenizer = RegexTransformer(inputCol="text", outputCol="tokens", regex=r"\w+")

In [None]:
# Rebuild the sample DataFrame, this time passing only the column name as the schema.
df = spark.createDataFrame([["Hello, world"]], schema=["text"])

In [None]:
# Apply the transformer to the sample DataFrame and print the result.
tokenizer.transform(df).show()

In [None]:
# Stop the `spark` session now that the notebook is finished.
spark.stop()