In [1]:
# Initialise findspark before any pyspark import in the cells below.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the environment setup.
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook, using the local
# master "local[2]" and the application name "cluster".
# Note: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("cluster")
    .master("local[2]")
    .getOrCreate()
)

In [4]:
# Tiny hand-written training set: one row per sentence, carrying an integer
# id, the raw text, and a 0.0 / 1.0 label.
training = sc.createDataFrame(
    [
        (0, "i like apple pie for dessert", 1.0),
        (1, "i dont drive fast cars", 0.0),
        (2, "data science is fun", 1.0),
        (3, "chocolate is not my favorite", 0.0),
        (4, "my favorite movie is predator", 1.0),
    ],
    schema=["id", "text", "label"],
)

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [6]:
# Stage 1: tokenizer reading the "text" column and writing "words".
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hashing term-frequency transformer; its input is whatever column
# the tokenizer writes to, and its output column is "features".
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: logistic regression over the "features" / "label" columns,
# limited to 10 iterations with a regularisation parameter of 0.01.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

In [7]:
# Assemble the three stages, in execution order, into a single ML pipeline.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ]
)

In [9]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.24.0-py3-none-any.whl (16.5 MB)
Collecting waitress
  Downloading waitress-2.1.1-py3-none-any.whl (57 kB)
Collecting databricks-cli>=0.8.7
  Downloading databricks-cli-0.16.4.tar.gz (58 kB)
Collecting gitpython>=2.1.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting cloudpickle
  Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting pandas
  Downloading pandas-1.4.1-cp38-cp38-win_amd64.whl (10.6 MB)
Collecting protobuf>=3.7.0
  Downloading protobuf-3.19.4-cp38-cp38-win_amd64.whl (895 kB)
Collecting importlib-metadata!=4.7.0,>=3.7.0
  Downloading importlib_metadata-4.11.3-py3-none-any.whl (18 kB)
Collecting pytz
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
Collecting alembic
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
Collecting sqlalchemy
  Downloading SQLAlchemy-1.4.32-cp38-cp38-win_amd64.whl (1.6 MB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp38-cp38-win_amd64.whl (155 kB)
Collecting p

In [10]:
import mlflow  # requires the mlflow package; if it is missing, install it with `%pip install mlflow`

In [11]:
# Fit the pipeline inside an MLflow run; the run handle is bound to `run1`.
with mlflow.start_run() as run1:
    # Turn on MLflow autologging for pyspark.ml before fitting.
    mlflow.pyspark.ml.autolog()
    # Fit all pipeline stages on the training DataFrame and keep the fitted model.
    model = pipeline.fit(training)

In [12]:
# Explicitly end the active MLflow run, if any.
# NOTE(review): the run above was opened with `with mlflow.start_run()`, which
# should already close it when the block exits, so this call is presumably a
# harmless safety net — confirm and consider removing the cell.
mlflow.end_run()

In [13]:
# Stop the Spark session (`sc`) created earlier in the notebook.
sc.stop()