<span style="color:red;font-weight:bold">Jayant Solanki</span>,
<span style="color:red;font-weight:bold">Anant Gupta</span>
<hr/>
## <span style="float:left">Lab 3</span>
### <span style="float:right">DATA ANALYTICS PIPELINE USING APACHE SPARK</span>
#### <span style="float:right">Article classifier using PySpark</span>
<hr/>

In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
from nltk.corpus import stopwords

In [2]:
stop_word_list = stopwords.words('english')
#  to quickly test if a word is not a stop word, use a set:
stop_word_set = set(stop_word_list)
stop_word_set = list(stop_word_set)
# Reading the training Data
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles-train.csv')
drop_list = ['Dates', 'Topic', 'Page']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)


+--------+--------------------+
|Category|                Body|
+--------+--------------------+
|business|SEOUL  WITH A FAL...|
|business|JEFFERSON CITY  M...|
|business|WASHINGTON  THE T...|
|business|REUTERS    METLIF...|
|  sports|DALLAS  WHEN DALL...|
+--------+--------------------+
only showing top 5 rows



In [3]:
data.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Body: string (nullable = true)



# Top 20 crime categories:

In [4]:
from pyspark.sql.functions import col
data.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|     Category|count|
+-------------+-----+
|     business| 2309|
|     politics| 1666|
|       sports|  571|
|entertainment|  464|
+-------------+-----+



In [5]:
data.groupBy("Body") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|                Body|count|
+--------------------+-----+
|HERE IS THE APRIL...|    9|
|BEVERLY HILLS  CA...|    9|
|WANT TO GET THIS ...|    8|
|BEIJING  U S  TRE...|    8|
|JOANNE KIM AND AY...|    7|
|WANT TO GET THIS ...|    7|
|INDIANAPOLIS  IND...|    7|
|WANT TO GET THIS ...|    7|
|WASHINGTON  FPI M...|    7|
|INDIANAPOLIS  OKL...|    7|
|LONDON  A RESURGE...|    7|
|AUBURN  WASH   PR...|    6|
|IT FEELS LIKE KAN...|    6|
|PHOENIX  WHEN FOR...|    6|
|INDIANAPOLIS  IND...|    6|
|LONDON  BRITAIN S...|    6|
|WE MAY NEVER STOP...|    6|
|AT THE 2014 CONSU...|    6|
|HERE ARE THE WEEK...|    6|
|VENTURA  CALIF   ...|    6|
+--------------------+-----+
only showing top 20 rows



In [6]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
# stop words
# add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_word_set)
# bag of words count
# countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
# pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# # Fit the pipeline to training documents.
# pipelineFit = pipeline.fit(data)
# dataset = pipelineFit.transform(data)
# dataset.show(5)

In [8]:
# set seed for reproducibility
# (trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
# print("Training Dataset Count: " + str(trainingData.count()))
# print("Test Dataset Count: " + str(testData.count()))

In [9]:
from pyspark.ml.feature import HashingTF, IDF
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 324)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+-------------+------------------------------+-----+----------+
|                          Body|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|DALIAN MANILA  CHINA PLANS ...|     business|[0.9993406144602194,1.38312...|  0.0|       0.0|
|LONDON FRANKFURT  GERMANY S...|     business|[0.9980119859619365,4.94516...|  0.0|       0.0|
|SYDNEY  AUSTRALIA S BIGGEST...|     business|[0.997918087143643,0.001395...|  0.0|       0.0|
|PARIS  SOCGEN S CHIEF EXECU...|     business|[0.997250963812292,0.001054...|  0.0|       0.0|
|ADEN  YEMEN  THE YOUNG MOTH...|     business|[0.9964283193746383,8.25149...|  0.0|       0.0|
|ADEN  YEMEN  THE YOUNG MOTH...|     business|[0.9964283193746383,8.25149...|  0.0|       0.0|
|HONG KONG  A GADGET MAKER  ...|     business|[0.9952530049599432,0.00187...|  0.0|       0.0|
|HONG KONG  A GADGET MAKER  ...|     business|[0.9

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6566287185364578

In [20]:
test = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles-test.csv')
drop_list = ['Dates', 'Topic', 'Page']
test = data.select([column for column in data.columns if column not in drop_list])
test.show(5)

+-------------+--------------------+
|     Category|                Body|
+-------------+--------------------+
|     business|CAIRO  EGYPT WILL...|
|     business|WANT TO GET THIS ...|
|     politics|WASHINGTON  U S  ...|
|     business|WITH THE ARRIVAL ...|
|entertainment|ON THURSDAY  SOON...|
+-------------+--------------------+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import col
test.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|     Category|count|
+-------------+-----+
|     business|  435|
|     politics|  352|
|       sports|  123|
|entertainment|   90|
+-------------+-----+



In [22]:
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(test)

In [23]:
testset = pipelineFit.transform(test)

In [24]:
predictions = lrModel.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|LONDON  LLOYD S OF LONDON  ...|business|[0.9987644291294391,5.51420...|  0.0|       0.0|
|NEW YORK  BILLIONAIRE INVES...|politics|[0.9971454845961765,8.76934...|  1.0|       0.0|
|NEW YORK  BILLIONAIRE INVES...|business|[0.9971454845961765,8.76934...|  0.0|       0.0|
|ATHENS PATRAS  GREECE  GREE...|business|[0.9956880840988583,0.00165...|  0.0|       0.0|
|LPC    THE SIZE OF FUND FIN...|business|[0.9938978451651596,0.00168...|  0.0|       0.0|
|SAN FRANCISCO  MOST BIG BAN...|business|[0.9932966005974454,0.00251...|  0.0|       0.0|
|LOS ANGELES LONDON  SWISS B...|business|[0.9931797480241308,0.00275...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.991502433831096,0.006527...|  0.0|       0.0|
|FRANKFURT

In [25]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6517946810259405

#Naive Bayes


In [29]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|MINNEAPOLIS  AN INFLUENTIAL...|politics|[1.0,5.330226441107558E-17,...|  1.0|       0.0|
|BEIJING  CHINA WOULD WELCOM...|business|[1.0,4.159153239216394E-17,...|  0.0|       0.0|
|WASHINGTON  THE U S  INTERN...|business|[1.0,2.0264435657261993E-17...|  0.0|       0.0|
|WHEN THE NATIONAL LABOR REL...|politics|[1.0,3.589415387489051E-18,...|  1.0|       0.0|
|DUBAI PARIS ABU DHABI  AIRB...|business|[1.0,8.86738023340102E-19,3...|  0.0|       0.0|
|PARIS  IF THERE S ONE INDUS...|business|[1.0,6.724641323965758E-19,...|  0.0|       0.0|
|PARIS  IF THERE S ONE INDUS...|business|[1.0,6.724641323965758E-19,...|  0.0|       0.0|
|BERLIN  INVESTMENT ADVISORY...|business|[1.0,6.363486360425474E-19,...|  0.0|       0.0|
|DETROIT  

In [30]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7143992853147192

In [34]:
predictions = model.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|REUTERS    GREENLAND PRIME ...|politics|[1.0,1.1043690275376262E-16...|  1.0|       0.0|
|MONTREAL  PARIS  BOMBARDIER...|business|[1.0,9.541448730737906E-17,...|  0.0|       0.0|
|WASHINGTON  THE INTERNATION...|politics|[1.0,4.363544965476946E-17,...|  1.0|       0.0|
|BERLIN  GERMANY PLANS TO MA...|business|[1.0,2.910375119020616E-17,...|  0.0|       0.0|
|BEIJING  CHINA WOULD WELCOM...|politics|[1.0,2.650632099357502E-17,...|  1.0|       0.0|
|AS SCOTT PRUITT  THE ADMINI...|politics|[1.0,1.3275905013933747E-17...|  1.0|       0.0|
|LONDON  BRITAIN S DEBT MANA...|business|[1.0,4.999388556309498E-19,...|  0.0|       0.0|
|NEW YORK  U S  MORTGAGE APP...|business|[1.0,1.789302779449396E-19,...|  0.0|       0.0|
|WHEN THE 

In [35]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7178706140460119

In [36]:
#randomforest

In [43]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+-------------+------------------------------+-----+----------+
|                          Body|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|PARIS  SOCGEN S CHIEF EXECU...|     business|[0.6395203743878681,0.19570...|  0.0|       0.0|
|REUTERS    SHARES OF APPLE ...|entertainment|[0.6383906429957138,0.18815...|  3.0|       0.0|
|REUTERS    SHARES OF APPLE ...|     business|[0.6383906429957138,0.18815...|  0.0|       0.0|
|BEIJING HONG KONG  CHINESE ...|     business|[0.6274841672236805,0.20503...|  0.0|       0.0|
|FRANKFURT  SWISS RE SAID IT...|     business|[0.6252033464609259,0.20659...|  0.0|       0.0|
|FRANKFURT  SWISS RE SAID IT...|     business|[0.6252033464609259,0.20659...|  0.0|       0.0|
|REUTERS    SHARES IN DANISH...|     business|[0.6250340340482012,0.20402...|  0.0|       0.0|
|REUTERS    YUM BRANDS INC S...|     business|[0.6

In [44]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.47682612604862123

In [45]:
predictions = rfModel.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|REUTERS    MASTERCARD INC T...|business|[0.6288462224263655,0.21043...|  0.0|       0.0|
|REUTERS    APPLE INC ON TUE...|business|[0.6201217115354624,0.19553...|  0.0|       0.0|
|REUTERS    APPLE INC ON TUE...|politics|[0.6201217115354624,0.19553...|  1.0|       0.0|
|NEW YORK  THE THREE MAJOR U...|business|[0.6188177527255668,0.19894...|  0.0|       0.0|
|HONG KONG  PING AN HEALTHCA...|business|[0.6177502212212742,0.20599...|  0.0|       0.0|
|FRANKFURT BERLIN  GERMANY S...|business|[0.6158418201280498,0.21677...|  0.0|       0.0|
|REUTERS    BRITAIN S INDIVI...|business|[0.6149627807412995,0.21186...|  0.0|       0.0|
|REUTERS    BRITAIN S INDIVI...|business|[0.6149627807412995,0.21186...|  0.0|       0.0|
|DRUGMAKER

In [46]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.47471800205224224