In [39]:
import spacy as sp
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Following chapter 6 of "Data Analysis with Python and PySpark" (Manning) for guidance.

In [40]:
# Create (or reuse) a local single-threaded Spark session for this notebook.
# `SparkSession.builder` is the documented entry point; the Builder class is
# not meant to be instantiated directly.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName("text")
    .getOrCreate()
)

In [41]:
# Unlike CSV, JSON needs no record-delimiter configuration and no type guessing:
# strings are always quoted, so 3843 is a number while "03843" is a string
# (ch. 6, Data Analysis with Python and PySpark, Manning).
# The multiLine option lets a single record span several lines of the file.
data = (
    spark.read
    .option("multiLine", True)
    .json('../data/batch.json')
)

In [42]:
# The count was 17 with multiLine=False: in that mode Spark reads the file as
# JSON Lines, i.e. every physical line is expected to be one complete record.
# When working with PySpark it may be better to store one entry per line, e.g.:
#   {"id": 1, "article": "python", "date": 2022}
#   {"id": 2, "article": "pyspark", "date": 2022}
# That would let us process the file line by line, much like a CSV - worth considering.
data.count()

12

In [43]:
# The inferred schema is pretty good: even the array inside the `keywords`
# field is kept as an array of strings.
data.printSchema()

root
 |-- _id: long (nullable = true)
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- date_published: long (nullable = true)
 |-- dek: string (nullable = true)
 |-- direction: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- lead_image_url: string (nullable = true)
 |-- next_page_url: string (nullable = true)
 |-- rendered_pages: long (nullable = true)
 |-- title: string (nullable = true)
 |-- total_pages: long (nullable = true)
 |-- url: string (nullable = true)
 |-- word_count: long (nullable = true)



In [44]:
# List the column names; as the cell's last expression the list is displayed
# without needing print().
data.columns

['_id', 'author', 'content', 'date_published', 'dek', 'direction', 'domain', 'excerpt', 'keywords', 'lead_image_url', 'next_page_url', 'rendered_pages', 'title', 'total_pages', 'url', 'word_count']


In [45]:
# Preview the DataFrame. By default show() prints at most 20 rows and
# truncates values longer than 20 characters.
data.show()

+---+--------------------+--------------------+--------------+----+---------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-----------+--------------------+----------+
|_id|              author|             content|date_published| dek|direction|              domain|             excerpt|            keywords|      lead_image_url|next_page_url|rendered_pages|               title|total_pages|                 url|word_count|
+---+--------------------+--------------------+--------------+----+---------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+--------------------+-----------+--------------------+----------+
|  1|          Matt Makai|\nAmazon Web Serv...|          2021|null|      ltr|     fullstackpython|Learn how to use ...|    [python, lambda]|https://www.fulls...|         null|             1|Application Perfo...|          1|https://w

In [50]:
# Explode the keywords array so that each keyword gets its own row next to its _id.
# PySpark gives greater control over this processing than pandas: the exploded
# keyword lists can be handled in a simpler, cleaner fashion
# (see proof_of_concept.ipynb for the pandas version).
data2 = data.select(
    data["_id"],
    explode(data["keywords"]).alias("wordsonline"),
)
data2.show()

+---+----------------+
|_id|     wordsonline|
+---+----------------+
|  1|          python|
|  1|          lambda|
|  2|          python|
|  2|        scraping|
|  3|          python|
|  3|        openPyxl|
|  4|          python|
|  4|machine learning|
|  5|          python|
|  5| escape sequence|
|  6|          python|
|  6|           learn|
|  7|          python|
|  7|          lambda|
|  8|          python|
|  8|   web developer|
|  9|          python|
| 10|          python|
| 10|            CRUD|
| 11|          python|
+---+----------------+
only showing top 20 rows

