In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import  BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, NGram, VectorAssembler, ChiSqSelector, StopWordsRemover

df = sqlContext.read.format('csv').options(delimiter='\t', header='true', inferSchema='true').load('/home/jluis2/datosbi01.csv')
df.count()

66410

In [6]:
%%time
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01])
def build_ngrams_wocs(inputCol=["review","label"], n=2):
    tokenizer = [Tokenizer(inputCol="review", outputCol="words")]
    stopwordsRemover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    return  Pipeline(stages=tokenizer + stopwordsRemover + ngrams)

theData = build_ngrams_wocs().fit(train_set)
hashtf = HashingTF(numFeatures=30000, inputCol="filtered", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "labels")
lsvc = LinearSVC(tol=0.0001, maxIter=8)
pipeline = Pipeline(stages=[theData, hashtf, idf, label_stringIdx, lsvc])

paramGrid = ParamGridBuilder().addGrid(lsvc.regParam, [0.05]).build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)  
pipelineFit = crossval.fit(train_set)
predictions = pipelineFit.transform(train_set)

CPU times: user 507 ms, sys: 127 ms, total: 634 ms
Wall time: 2min 11s


In [7]:
chk = pipelineFit.avgMetrics
print("AVG Accuracy:", chk)

evaluator = RegressionEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="mse")
mse = evaluator.evaluate(predictions)
print("Mean Square Error (MSE):", mse)

('AVG Accuracy:', [0.910536914432777])
('Mean Square Error (MSE):', 0.09390176430810844)
