In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
from nltk.corpus import stopwords

In [2]:
stop_word_list = stopwords.words('english')
#  to quickly test if a word is not a stop word, use a set:
stop_word_set = set(stop_word_list)
stop_word_set = list(stop_word_set)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles-articles.csv')
drop_list = ['Dates', 'Topic', 'Page']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)


+--------+--------------------+
|Category|                Body|
+--------+--------------------+
|politics|WITH THE ARRIVAL ...|
|business|TENS OF THOUSANDS...|
|politics|WASHINGTON  PRESI...|
|business|OMAHA  ELON MUSK ...|
|politics|REUTERS    THE TR...|
+--------+--------------------+
only showing top 5 rows



In [3]:
data.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Body: string (nullable = true)



# Top 20 crime categories:

In [4]:
from pyspark.sql.functions import col
data.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|     Category|count|
+-------------+-----+
|     politics| 1581|
|     business|  972|
|       sports|  545|
|entertainment|  214|
+-------------+-----+



In [5]:
data.groupBy("Body") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|                Body|count|
+--------------------+-----+
|BEVERLY HILLS  CA...|    9|
|INDIANAPOLIS  OKL...|    7|
|NEW YORK  SPECIAL...|    6|
|BEIJING  U S  TRE...|    6|
|WASHINGTON  FPI M...|    6|
|WASHINGTON  PRESI...|    6|
|WE MAY NEVER STOP...|    5|
|PRESIDENT TRUMP S...|    5|
|HERE IS THE APRIL...|    5|
|WANT TO GET THIS ...|    5|
|AUBURN  WASH   PR...|    5|
|TENS OF THOUSANDS...|    5|
|WASHINGTON  U S  ...|    5|
|IT FEELS LIKE KAN...|    4|
|HAVANA  AT CUBA S...|    4|
|WHEN THE NATIONAL...|    4|
|WASHINGTON  TOP T...|    4|
|TEHRAN  DEEP IN T...|    4|
|CARACAS  PRESIDEN...|    4|
|WASHINGTON  THE F...|    4|
+--------------------+-----+
only showing top 20 rows



In [6]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
# stop words
# add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_word_set)
# bag of words count
# countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
# pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# # Fit the pipeline to training documents.
# pipelineFit = pipeline.fit(data)
# dataset = pipelineFit.transform(data)
# dataset.show(5)

In [8]:
# set seed for reproducibility
# (trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
# print("Training Dataset Count: " + str(trainingData.count()))
# print("Test Dataset Count: " + str(testData.count()))

In [10]:
from pyspark.ml.feature import HashingTF, IDF
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

NameError: name 'StringIndexer' is not defined

In [51]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6626393091863224

In [40]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = trainingData
parsedData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel")
sameModel = LogisticRegressionModel.load(sc,
                                         "target/tmp/pythonLogisticRegressionWithLBFGSModel")

AttributeError: 'DataFrame' object has no attribute 'map'