Juber Rahman </br>
date: 08/27/2020

In [3]:
from pyspark.ml import Pipeline 
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

In [18]:
data_path = 'uci-news-aggregator.csv'
news_data = spark.read.csv(data_path,header = 'True',inferSchema='True')

In [19]:
news_data.show()

+---+--------------------+--------------------+--------------------+--------+--------------------+--------------------+-------------+
| ID|               TITLE|                 URL|           PUBLISHER|CATEGORY|               STORY|            HOSTNAME|    TIMESTAMP|
+---+--------------------+--------------------+--------------------+--------+--------------------+--------------------+-------------+
|  1|Fed official says...|http://www.latime...|   Los Angeles Times|       b|ddUyU0VZz0BRneMio...|     www.latimes.com|1394470370698|
|  2|Fed's Charles Plo...|http://www.livemi...|            Livemint|       b|ddUyU0VZz0BRneMio...|    www.livemint.com|1394470371207|
|  3|US open: Stocks f...|http://www.ifamag...|        IFA Magazine|       b|ddUyU0VZz0BRneMio...| www.ifamagazine.com|1394470371550|
|  4|Fed risks falling...|http://www.ifamag...|        IFA Magazine|       b|ddUyU0VZz0BRneMio...| www.ifamagazine.com|1394470371793|
|  5|Fed's Plosser: Na...|http://www.moneyn...|           Mone

In [20]:
news_data.count()

73485

In [21]:
title_category = news_data.select("TITLE","CATEGORY")

In [22]:
title_category.show()

+--------------------+--------+
|               TITLE|CATEGORY|
+--------------------+--------+
|Fed official says...|       b|
|Fed's Charles Plo...|       b|
|US open: Stocks f...|       b|
|Fed risks falling...|       b|
|Fed's Plosser: Na...|       b|
|Plosser: Fed May ...|       b|
|Fed's Plosser: Ta...|       b|
|Fed's Plosser exp...|       b|
|US jobs growth la...|       b|
|ECB unlikely to e...|       b|
|ECB unlikely to e...|       b|
|EU's half-baked b...|       b|
|Europe reaches cr...|       b|
|ECB FOCUS-Stronge...|       b|
|EU aims for deal ...|       b|
|Forex - Pound dro...|       b|
|Noyer Says Strong...|       b|
|EU Week Ahead Mar...|       b|
|ECB member Noyer ...|       b|
|Euro Anxieties Wa...|       b|
+--------------------+--------+
only showing top 20 rows



In [23]:
def null_value_count(df):
  null_columns_counts = []
  numRows = df.count()
  for k in df.columns:
    nullRows = df.where(col(k).isNull()).count()
    if(nullRows > 0):
      temp = k,nullRows
      null_columns_counts.append(temp)
  return(null_columns_counts)

In [24]:
null_columns_count_list = null_value_count(title_category)

In [25]:
spark.createDataFrame(null_columns_count_list, ['Column_With_Null_Value', 'Null_Values_Count']).show()

+----------------------+-----------------+
|Column_With_Null_Value|Null_Values_Count|
+----------------------+-----------------+
|                 TITLE|               61|
|              CATEGORY|               75|
+----------------------+-----------------+



In [26]:
title_category = title_category.dropna()


In [27]:
title_category.count()

73410

In [28]:
title_category.show(truncate=False)

+---------------------------------------------------------------------------+--------+
|TITLE                                                                      |CATEGORY|
+---------------------------------------------------------------------------+--------+
|Fed official says weak data caused by weather, should not slow taper       |b       |
|Fed's Charles Plosser sees high bar for change in pace of tapering         |b       |
|US open: Stocks fall after Fed official hints at accelerated tapering      |b       |
|Fed risks falling 'behind the curve', Charles Plosser says                 |b       |
|Fed's Plosser: Nasty Weather Has Curbed Job Growth                         |b       |
|Plosser: Fed May Have to Accelerate Tapering Pace                          |b       |
|Fed's Plosser: Taper pace may be too slow                                  |b       |
|Fed's Plosser expects US unemployment to fall to 6.2% by the end of 2014   |b       |
|US jobs growth last month hit by weather:F

In [29]:
title_category.select("Category").distinct().count()

78

In [30]:
title_category.groupBy("Category").count().orderBy(col("count").desc()).show(truncate=False)

+-------------------------------------+-----+
|Category                             |count|
+-------------------------------------+-----+
|e                                    |25372|
|b                                    |21175|
|t                                    |18462|
|m                                    |8296 |
|GossipCop                            |5    |
|Us Magazine                          |4    |
|The Hollywood Gossip                 |4    |
|Gamepur                              |4    |
|Contactmusic.com                     |4    |
|Fast Company                         |3    |
|Complex.com                          |3    |
|Reality TV World                     |3    |
|TooFab.com                           |3    |
|Mobile Burn                          |3    |
|The Escapist                         |2    |
|Fashion Times                        |2    |
|FasterLouder                         |2    |
|NBC Bay Area                         |2    |
|WFJA Classic Hits and Oldies 105.

In [31]:
title_category.groupBy("TITLE").count().orderBy(col("count").desc()).show(truncate=False)


+---------------------------------------------------------------------+-----+
|TITLE                                                                |count|
+---------------------------------------------------------------------+-----+
|(click the phrases to see a list)                                    |15   |
|Business Highlights                                                  |12   |
|Get the Most Popular Beauty World News Stories in a Weekly Newsletter|11   |
|PR Newswire                                                          |8    |
|AP News in Brief at 5:58 a.m. EDT                                    |6    |
|Business Wire                                                        |6    |
|From ColumbusAlive.com                                               |6    |
|International markets roundup                                        |6    |
|Visionary or looney? Zuckerberg on spending spree                    |6    |
|Russell Crowe calls 'Noah' criticism 'irrational'              

In [32]:
title_category = title_category.withColumn("only_str",regexp_replace(col('TITLE'), '\d+', ''))

In [33]:
title_category.select("TITLE","only_str").show(truncate=False)

+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|TITLE                                                                      |only_str                                                                   |
+---------------------------------------------------------------------------+---------------------------------------------------------------------------+
|Fed official says weak data caused by weather, should not slow taper       |Fed official says weak data caused by weather, should not slow taper       |
|Fed's Charles Plosser sees high bar for change in pace of tapering         |Fed's Charles Plosser sees high bar for change in pace of tapering         |
|US open: Stocks fall after Fed official hints at accelerated tapering      |US open: Stocks fall after Fed official hints at accelerated tapering      |
|Fed risks falling 'behind the curve', Charles Plosser says                 

In [34]:
regex_tokenizer = RegexTokenizer(inputCol="only_str", outputCol="words", pattern="\\W")
raw_words = regex_tokenizer.transform(title_category)


In [35]:
raw_words.show()

+--------------------+--------+--------------------+--------------------+
|               TITLE|CATEGORY|            only_str|               words|
+--------------------+--------+--------------------+--------------------+
|Fed official says...|       b|Fed official says...|[fed, official, s...|
|Fed's Charles Plo...|       b|Fed's Charles Plo...|[fed, s, charles,...|
|US open: Stocks f...|       b|US open: Stocks f...|[us, open, stocks...|
|Fed risks falling...|       b|Fed risks falling...|[fed, risks, fall...|
|Fed's Plosser: Na...|       b|Fed's Plosser: Na...|[fed, s, plosser,...|
|Plosser: Fed May ...|       b|Plosser: Fed May ...|[plosser, fed, ma...|
|Fed's Plosser: Ta...|       b|Fed's Plosser: Ta...|[fed, s, plosser,...|
|Fed's Plosser exp...|       b|Fed's Plosser exp...|[fed, s, plosser,...|
|US jobs growth la...|       b|US jobs growth la...|[us, jobs, growth...|
|ECB unlikely to e...|       b|ECB unlikely to e...|[ecb, unlikely, t...|
|ECB unlikely to e...|       b|ECB unl

In [36]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
words_df = remover.transform(raw_words)

In [37]:
words_df.select("words","filtered").show()


+--------------------+--------------------+
|               words|            filtered|
+--------------------+--------------------+
|[fed, official, s...|[fed, official, s...|
|[fed, s, charles,...|[fed, charles, pl...|
|[us, open, stocks...|[us, open, stocks...|
|[fed, risks, fall...|[fed, risks, fall...|
|[fed, s, plosser,...|[fed, plosser, na...|
|[plosser, fed, ma...|[plosser, fed, ma...|
|[fed, s, plosser,...|[fed, plosser, ta...|
|[fed, s, plosser,...|[fed, plosser, ex...|
|[us, jobs, growth...|[us, jobs, growth...|
|[ecb, unlikely, t...|[ecb, unlikely, e...|
|[ecb, unlikely, t...|[ecb, unlikely, e...|
|[eu, s, half, bak...|[eu, half, baked,...|
|[europe, reaches,...|[europe, reaches,...|
|[ecb, focus, stro...|[ecb, focus, stro...|
|[eu, aims, for, d...|[eu, aims, deal, ...|
|[forex, pound, dr...|[forex, pound, dr...|
|[noyer, says, str...|[noyer, says, str...|
|[eu, week, ahead,...|[eu, week, ahead,...|
|[ecb, member, noy...|[ecb, member, noy...|
|[euro, anxieties,...|[euro, anx

In [38]:
indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
feature_data = indexer.fit(words_df).transform(words_df)


In [39]:
feature_data.select("CATEGORY","categoryIndex").show()


+--------+-------------+
|CATEGORY|categoryIndex|
+--------+-------------+
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
|       b|          1.0|
+--------+-------------+
only showing top 20 rows



In [40]:
cv = CountVectorizer(inputCol="filtered", outputCol="features")
model = cv.fit(feature_data)
countVectorizer_feateures = model.transform(feature_data)


In [41]:
(trainingData, testData) = countVectorizer_feateures.randomSplit([0.8, 0.2],seed = 11)

In [42]:
nb = NaiveBayes(modelType="multinomial",labelCol="categoryIndex", featuresCol="features")
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)


In [43]:
nb_predictions.select("prediction", "categoryIndex", "features").show(5)

+----------+-------------+--------------------+
|prediction|categoryIndex|            features|
+----------+-------------+--------------------+
|       0.0|          0.0|(21944,[28,180,70...|
|       0.0|          0.0|(21944,[31,704,14...|
|       0.0|          0.0|(21944,[83,109,11...|
|       0.0|          0.0|(21944,[19,77,203...|
|       0.0|         12.0|(21944,[203,324,7...|
+----------+-------------+--------------------+
only showing top 5 rows



In [45]:
evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g"% (nb_accuracy))
print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))


Accuracy of NaiveBayes is = 0.945332
Test Error of NaiveBayes = 0.054668 
