In [1]:
from pyspark.sql import *
import singer
import json
from pprint import pprint

In [2]:
# JSON Schema (draft-07) for a user record with four properties.
# Key order is kept deliberately: it determines the order of keys in the
# serialized SCHEMA message.
json_schema = {
    "properties": {
        "id": {"type": "integer"},
        "name": {"type": "string"},
        # Age must lie between 16 and 90.
        "age": {"maximum": 90, "minimum": 16, "type": "integer"},
        "has_children": {"type": "boolean"},
    },
    "$id": "my_user_schema.json",
    "$schema": "http://json-schema.org/draft-07/schema#",
}

# Emit the SCHEMA message for the DC_employees stream, keyed on `id`.
singer.write_schema(
    stream_name='DC_employees',
    schema=json_schema,
    key_properties=['id'],
)

{"type": "SCHEMA", "stream": "DC_employees", "schema": {"properties": {"id": {"type": "integer"}, "name": {"type": "string"}, "age": {"maximum": 90, "minimum": 16, "type": "integer"}, "has_children": {"type": "boolean"}}, "$id": "my_user_schema.json", "$schema": "http://json-schema.org/draft-07/schema#"}, "key_properties": ["id"]}


In [3]:
# Schema for one product record. Key order is preserved because it
# determines the order of keys in the serialized SCHEMA message.
products_schema = {
    "properties": {
        "brand": {"type": "string"},
        "model": {"type": "string"},
        "price": {"type": "number"},
        "currency": {"type": "string"},
        # At least one unit.
        "quantity": {"type": "number", "minimum": 1},
        "date": {"type": "string", "format": "date"},
        # Exactly two uppercase letters A-Z.
        "countrycode": {"type": "string", "pattern": "^[A-Z]{2}$"},
        "store_name": {"type": "string"},
    },
}

# Emit the SCHEMA message for the products stream; no key properties are declared.
singer.write_schema(
    stream_name='products',
    schema=products_schema,
    key_properties=[],
)

{"type": "SCHEMA", "stream": "products", "schema": {"properties": {"brand": {"type": "string"}, "model": {"type": "string"}, "price": {"type": "number"}, "currency": {"type": "string"}, "quantity": {"type": "number", "minimum": 1}, "date": {"type": "string", "format": "date"}, "countrycode": {"type": "string", "pattern": "^[A-Z]{2}$"}, "store_name": {"type": "string"}}}, "key_properties": []}


In [4]:
# Field names, in the same order as the values inside each user tuple.
columns = ("id", "name", "age", "has_children")

# Kept as a list rather than a set: list.pop() always removes the LAST
# element, so successive pops yield Hilary, then Ruanne, then Adrian on every
# run. set.pop() removes an arbitrary element, and because these tuples
# contain strings (whose hashes are randomized per interpreter process) the
# popped user could change between kernel restarts.
users = [
    (1, "Adrian", 32, False),
    (2, "Ruanne", 28, False),
    (3, "Hilary", 29, True),
]

# Pair the field names with the values of one popped user tuple and emit the
# result as a RECORD message. users.pop() removes that tuple from `users`,
# so every run of this cell consumes one user.
singer.write_record(
    stream_name='DC_employees',
    record=dict(zip(columns, users.pop())),
)

{"type": "RECORD", "stream": "DC_employees", "record": {"id": 3, "name": "Hilary", "age": 29, "has_children": true}}


In [5]:
# Build a RECORD message by hand instead of calling singer.write_record.
# `fixed_dict` holds the part of the message that is identical for every record.
fixed_dict = dict(type="RECORD", stream="DC_employees")

# Copy the fixed part, then append the payload so the key order stays
# type, stream, record. users.pop() removes the tuple it returns from `users`.
record_msg = dict(fixed_dict)
record_msg["record"] = dict(zip(columns, users.pop()))

print(json.dumps(record_msg))

{"type": "RECORD", "stream": "DC_employees", "record": {"id": 2, "name": "Ruanne", "age": 28, "has_children": false}}


In [6]:
# Get the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Read the CSV using its first line as column names. No schema is supplied
# and no inference is requested, so the printed schema reports every column
# as a string.
boston_housing = (
    spark.read
    .option("header", True)
    .csv('pyspark-ML-in-Colab/BostonHousing.csv')
)
print(boston_housing.schema)
boston_housing.show()

StructType(List(StructField(crim,StringType,true),StructField(zn,StringType,true),StructField(indus,StringType,true),StructField(chas,StringType,true),StructField(nox,StringType,true),StructField(rm,StringType,true),StructField(age,StringType,true),StructField(dis,StringType,true),StructField(rad,StringType,true),StructField(tax,StringType,true),StructField(ptratio,StringType,true),StructField(b,StringType,true),StructField(lstat,StringType,true),StructField(medv,StringType,true)))
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|0.00632|  18| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|  24|
|0.02731|   0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729|   0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237|