In [22]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [23]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Transformers and Estimators

### Transformers - Tokenizer

In [24]:
from pyspark.ml.feature import Tokenizer

In [25]:
sentenceDataFrame = sqlc.createDataFrame([(0, "Hi I heard about Spark"),
                                          (1, "I wish Java could use case classes"),
                                          (2, "Logistic,regression,models,are,neat")]) \
                    .toDF("label", "sentence")

In [26]:
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
tokenized = tokenizer.transform(sentenceDataFrame)

In [27]:
tokenized.toPandas()

Unnamed: 0,label,sentence,words
0,0,Hi I heard about Spark,"[hi, i, heard, about, spark]"
1,1,I wish Java could use case classes,"[i, wish, java, could, use, case, classes]"
2,2,"Logistic,regression,models,are,neat","[logistic,regression,models,are,neat]"


### Transformers - Vector Assembler

In [28]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))
dfRandom.show()

+---+-------------------+--------------------+-------------------+
| id|            uniform|             normal1|            normal2|
+---+-------------------+--------------------+-------------------+
|  0|0.41371264720975787| -0.5877482396744728| -0.256535324205377|
|  1| 0.1982919638208397|  -0.256535324205377| -0.506853671746243|
|  2|0.12714181165849525|-0.31703264334668824| 1.4250903895905769|
|  3|0.12030715258495939|  -0.506853671746243|-0.1413699193557902|
|  4|0.12131363910425985|  1.4250903895905769| 0.9657665088756656|
|  5|0.44292918521277047| -0.1413699193557902|-0.7265875219949972|
|  6| 0.2731073068483362| -0.7265875219949972|-1.1985385526188836|
|  7| 0.7784518091224375|  0.8916973357535916|  -1.49887708058667|
|  8|   0.87079354700073| -1.1985385526188836|-0.1171109260011378|
|  9| 0.8729462507631428|   -1.49887708058667|-0.2233716065707664|
+---+-------------------+--------------------+-------------------+



In [30]:
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)

In [42]:
dfVec.select("id","features")

AttributeError: 'NoneType' object has no attribute '_jvm'

### Estimator - Logistic Regression

In [13]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

In [15]:
training = sqlc.createDataFrame([(1.0, Vectors.dense(0.0, 1.1, 0.1)),
                                       (0.0, Vectors.dense(2.0, 1.0, -1.0)),
                                       (0.0, Vectors.dense(2.0, 1.3, 1.0)),
                                       (1.0, Vectors.dense(0.0, 1.2, -0.5))]) \
            .toDF("label", "features")
training.show()    

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0| [0.0,1.1,0.1]|
|  0.0|[2.0,1.0,-1.0]|
|  0.0| [2.0,1.3,1.0]|
|  1.0|[0.0,1.2,-0.5]|
+-----+--------------+



In [40]:
lr = LogisticRegression()
lr.setFeaturesCol('features').setLabelCol('label')

LogisticRegression_40979fd39c6f6333ded4

In [33]:
lr.extractParamMap()

{Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='standardization', doc='whether to standardize the training features before fitting the model.'): True,
 Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',
 Param(parent='LogisticRegression_422ca81481b7c318dd8c', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated proba

In [34]:
{p[0].name: p[1] for p in list(lr.extractParamMap().items())}

{'aggregationDepth': 2,
 'standardization': True,
 'fitIntercept': True,
 'elasticNetParam': 0.0,
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability',
 'predictionCol': 'prediction',
 'labelCol': 'label',
 'featuresCol': 'features',
 'maxIter': 10,
 'regParam': 0.01,
 'tol': 1e-06,
 'threshold': 0.5,
 'family': 'auto'}

In [17]:
lr.setMaxIter(10).setRegParam(0.01)

LogisticRegression_422ca81481b7c318dd8c

In [18]:
model1 = lr.fit(training, {'maxIter': 10, 'regParam': 0.01})
model1.coefficients

DenseVector([-3.1009, 2.6082, -0.3802])

In [35]:
{p[0].name: p[1] for p in list(lr.extractParamMap().items())}

{'aggregationDepth': 2,
 'standardization': True,
 'fitIntercept': True,
 'elasticNetParam': 0.0,
 'rawPredictionCol': 'rawPrediction',
 'probabilityCol': 'probability',
 'predictionCol': 'prediction',
 'labelCol': 'label',
 'featuresCol': 'features',
 'maxIter': 10,
 'regParam': 0.01,
 'tol': 1e-06,
 'threshold': 0.5,
 'family': 'auto'}

In [19]:
model1.transform(training).toPandas()

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1.0,"[0.0, 1.1, 0.1]","[-2.89919489464, 2.89919489464]","[0.052193376663, 0.947806623337]",1.0
1,0.0,"[2.0, 1.0, -1.0]","[3.14530074644, -3.14530074644]","[0.95872315829, 0.04127684171]",0.0
2,0.0,"[2.0, 1.3, 1.0]","[3.12319457003, -3.12319457003]","[0.95783942353, 0.0421605764704]",0.0
3,1.0,"[0.0, 1.2, -0.5]","[-3.388123842, 3.388123842]","[0.0326686926626, 0.967331307337]",1.0


In [41]:
sc.stop()