<span style="color:red;font-weight:bold">Jayant Solanki</span>,
<span style="color:red;font-weight:bold">Anant Gupta</span>
<hr/>
## <span style="float:left">Lab 3</span>
### <span style="float:right">DATA ANALYTICS PIPELINE USING APACHE SPARK</span>
#### <span style="float:right">Article classifier using PySpark</span>
<hr/>

In [2]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc =SparkContext()
sqlContext = SQLContext(sc)
from nltk.corpus import stopwords
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.functions import col
#LR
from pyspark.ml.classification import LogisticRegression
#NB
from pyspark.ml.classification import NaiveBayes
#RF
from pyspark.ml.classification import RandomForestClassifier

In [3]:
stop_word_list = stopwords.words('english')
#  to quickly test if a word is not a stop word, use a set:
stop_word_set = set(stop_word_list)
stop_word_set = list(stop_word_set)
# Reading the training Data
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles-train.csv')
drop_list = ['Dates', 'Topic', 'Page']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)


+--------+--------------------+
|Category|                Body|
+--------+--------------------+
|business|SEOUL  WITH A FAL...|
|business|JEFFERSON CITY  M...|
|business|WASHINGTON  THE T...|
|business|REUTERS    METLIF...|
|  sports|DALLAS  WHEN DALL...|
+--------+--------------------+
only showing top 5 rows



In [4]:
data.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Body: string (nullable = true)



# Top 4 Article categories:

In [5]:

data.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------------+-----+
|     Category|count|
+-------------+-----+
|     business| 2309|
|     politics| 1666|
|       sports|  571|
|entertainment|  464|
+-------------+-----+



# Showing top 20 frequent articles:

In [6]:
data.groupBy("Body") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|                Body|count|
+--------------------+-----+
|HERE IS THE APRIL...|    9|
|BEVERLY HILLS  CA...|    9|
|BEIJING  U S  TRE...|    8|
|WANT TO GET THIS ...|    8|
|JOANNE KIM AND AY...|    7|
|WANT TO GET THIS ...|    7|
|INDIANAPOLIS  IND...|    7|
|INDIANAPOLIS  OKL...|    7|
|WANT TO GET THIS ...|    7|
|WASHINGTON  FPI M...|    7|
|LONDON  A RESURGE...|    7|
|WASHINGTON  A LOW...|    6|
|COPENHAGEN  AT FI...|    6|
|PHOENIX  WHEN FOR...|    6|
|HERE ARE THE WEEK...|    6|
|VENTURA  CALIF   ...|    6|
|WE MAY NEVER STOP...|    6|
|INDIANAPOLIS  IND...|    6|
|LONDON  BRITAIN S...|    6|
|IT FEELS LIKE KAN...|    6|
+--------------------+-----+
only showing top 20 rows



In [7]:

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
# stop words
#using ntlk stopwords
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_word_set)

In [8]:
#convertng string category to integer labels
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
#creating internal test data by partitioning the training data, 80% and 20%
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 324)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+-------------+------------------------------+-----+----------+
|                          Body|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|DALIAN MANILA  CHINA PLANS ...|     business|[0.9993406144602194,1.38312...|  0.0|       0.0|
|LONDON FRANKFURT  GERMANY S...|     business|[0.9980119859619365,4.94516...|  0.0|       0.0|
|SYDNEY  AUSTRALIA S BIGGEST...|     business|[0.997918087143643,0.001395...|  0.0|       0.0|
|PARIS  SOCGEN S CHIEF EXECU...|     business|[0.9972509638122917,0.00105...|  0.0|       0.0|
|ADEN  YEMEN  THE YOUNG MOTH...|     business|[0.9964283193746383,8.25149...|  0.0|       0.0|
|ADEN  YEMEN  THE YOUNG MOTH...|     business|[0.9964283193746383,8.25149...|  0.0|       0.0|
|HONG KONG  A GADGET MAKER  ...|     business|[0.9952530049599436,0.00187...|  0.0|       0.0|
|HONG KONG  A GADGET MAKER  ...|entertainment|[0.9

In [9]:
#printing the accuracy for test data, internal partition

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")


65.66287185364578%


In [10]:
## Getting Test Data

In [11]:
test = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('articles-test.csv')
drop_list = ['Dates', 'Topic', 'Page']
test = data.select([column for column in data.columns if column not in drop_list])
test.show(5)

+--------+--------------------+
|Category|                Body|
+--------+--------------------+
|business|SEOUL  WITH A FAL...|
|business|JEFFERSON CITY  M...|
|business|WASHINGTON  THE T...|
|business|REUTERS    METLIF...|
|  sports|DALLAS  WHEN DALL...|
+--------+--------------------+
only showing top 5 rows



In [12]:

# test.groupBy("Category") \
#     .count() \
#     .orderBy(col("count").desc()) \
#     .show()

In [13]:
#creating pipeline for test data
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(test)

In [14]:
testset = pipelineFit.transform(test)

# performing the prediction on test data

In [15]:
predictions = lrModel.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|DALIAN MANILA  CHINA PLANS ...|business|[0.9993406144602194,1.38312...|  0.0|       0.0|
|DALIAN MANILA  CHINA PLANS ...|business|[0.9993406144602194,1.38312...|  0.0|       0.0|
|LONDON  LLOYD S OF LONDON  ...|business|[0.9993012765310461,2.97857...|  0.0|       0.0|
|LONDON  LLOYD S OF LONDON  ...|business|[0.9990994852983173,4.17553...|  0.0|       0.0|
|LONDON  LLOYD S OF LONDON  ...|business|[0.9990994852983173,4.17553...|  0.0|       0.0|
|LONDON  SAINSBURY S  BILLIO...|business|[0.9989058826539344,2.26016...|  0.0|       0.0|
|LONDON  SAINSBURY S  BILLIO...|business|[0.9989058826539344,2.26016...|  0.0|       0.0|
|LONDON  IF SAINSBURY S IS T...|business|[0.9986346217462178,4.92853...|  0.0|       0.0|
|LONDON  I

In [16]:
#printing accuracy for test data
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")

79.79084922846847%


# Naive Bayes


In [17]:
#training the naive bayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|MINNEAPOLIS  AN INFLUENTIAL...|politics|[1.0,5.330226441107558E-17,...|  1.0|       0.0|
|BEIJING  CHINA WOULD WELCOM...|business|[1.0,4.159153239216394E-17,...|  0.0|       0.0|
|WASHINGTON  THE U S  INTERN...|business|[1.0,2.0264435657261993E-17...|  0.0|       0.0|
|WHEN THE NATIONAL LABOR REL...|politics|[1.0,3.589415387489051E-18,...|  1.0|       0.0|
|DUBAI PARIS ABU DHABI  AIRB...|business|[1.0,8.86738023340102E-19,3...|  0.0|       0.0|
|PARIS  IF THERE S ONE INDUS...|business|[1.0,6.724641323965758E-19,...|  0.0|       0.0|
|PARIS  IF THERE S ONE INDUS...|business|[1.0,6.724641323965758E-19,...|  0.0|       0.0|
|BERLIN  INVESTMENT ADVISORY...|business|[1.0,6.363486360425474E-19,...|  0.0|       0.0|
|DETROIT  

In [18]:
#printing the accuracy for test data, internal partition
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")

71.43992853147192%


In [19]:
#printing accuracy for test data

In [20]:
predictions = model.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|MUSEUMS HAVE LONG CONSIDERE...|business|[1.0,6.299709301157002E-17,...|  0.0|       0.0|
|MINNEAPOLIS  AN INFLUENTIAL...|politics|[1.0,5.330226441107558E-17,...|  1.0|       0.0|
|BEIJING  CHINA WOULD WELCOM...|business|[1.0,4.159153239216394E-17,...|  0.0|       0.0|
|WASHINGTON  JPMORGAN CHASE ...|business|[1.0,3.687596167915898E-17,...|  0.0|       0.0|
|WASHINGTON  JPMORGAN CHASE ...|politics|[1.0,3.687596167915898E-17,...|  1.0|       0.0|
|NAIROBI  UNIDENTIFIED GUNME...|business|[1.0,3.398088016330037E-17,...|  0.0|       0.0|
|PHOENIX  STRIKING ARIZONA T...|business|[1.0,3.2075171893610285E-17...|  0.0|       0.0|
|PHOENIX  STRIKING ARIZONA T...|politics|[1.0,3.2075171893610285E-17...|  1.0|       0.0|
|WASHINGTO

# performing the prediction on test data

In [21]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")

76.94682115429076%


# randomforest

In [22]:

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+-------------+------------------------------+-----+----------+
|                          Body|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|REUTERS    YUM BRANDS INC S...|     business|[0.6045200571578614,0.22972...|  0.0|       0.0|
|PARIS  BNP PARIBAS  FRANCE ...|     business|[0.5990675770405907,0.22939...|  0.0|       0.0|
|REUTERS    SHARES OF APPLE ...|entertainment|[0.5976971546105987,0.22424...|  3.0|       0.0|
|REUTERS    SHARES OF APPLE ...|     business|[0.5976971546105987,0.22424...|  0.0|       0.0|
|REUTERS    SHARES IN DANISH...|     business|[0.5941670499121319,0.23042...|  0.0|       0.0|
|LOS ANGELES LONDON  SWISS B...|     business|[0.5932977212637036,0.24163...|  0.0|       0.0|
|LOS ANGELES  SPROUTS FARMER...|     business|[0.5923558071274136,0.23751...|  0.0|       0.0|
|SAN FRANCISCO  SINCE APPLE ...|     business|[0.5

In [23]:
#printing the accuracy for test data, internal partition
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")

48.32796582928188%


# performing the prediction on test data

In [24]:
predictions = rfModel.transform(testset)
predictions.filter(predictions['prediction'] == 0) \
    .select("Body","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          Body|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|HONG KONG LONDON  STANDARD ...|business|[0.6257023876531506,0.20996...|  0.0|       0.0|
|HONG KONG LONDON  STANDARD ...|business|[0.6257023876531506,0.20996...|  0.0|       0.0|
|HONG KONG LONDON  STANDARD ...|business|[0.6257023876531506,0.20996...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.6152751974127849,0.21709...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.6152751974127849,0.21709...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.6152751974127849,0.21709...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.6152751974127849,0.21709...|  0.0|       0.0|
|LONDON  A RESURGENT DOLLAR ...|business|[0.6152751974127849,0.21709...|  0.0|       0.0|
|LONDON  A

In [25]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print(str(evaluator.evaluate(predictions)*100)+"%")


51.71334098816666%
