In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession against the "local" master, with the
# application name "Data Frame" and the Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Data Frame")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# SparkContext behind the session; the cells below use it to build RDDs.
sc = spark.sparkContext

In [3]:
# One JSON document per element; each record carries id, name, age, eyeColor.
swimmer_json_records = (
    '{"id": "123", "name": "Katie", "age": 19, "eyeColor": "brown"}',
    '{"id": "234", "name": "Michael", "age": 22, "eyeColor": "green"}',
    '{"id": "345", "name": "Simone", "age": 23, "eyeColor": "blue"}',
)

# Distribute the JSON strings as an RDD.
stringJSONRDD = sc.parallelize(swimmer_json_records)

In [4]:
# Build a DataFrame from the RDD of JSON strings via spark.read.json.
# NOTE(review): the original comment described spark.read as a narrow
# transformation; schema inference presumably has to scan the data, so this
# call may launch a Spark job rather than being purely lazy — confirm.
swimmersJSON = spark.read.json(stringJSONRDD)

                                                                                

In [5]:
# Register the DataFrame as a temporary view named 'swimmersJSON'
# (replacing any existing view with that name).
swimmersJSON.createOrReplaceTempView('swimmersJSON')

In [6]:
# Print the DataFrame rows as a text table.
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [8]:
# Show the RDD's Python type, then its collected contents, one per line.
print(
    type(stringJSONRDD),
    stringJSONRDD.collect(),
    sep="\n",
)

<class 'pyspark.rdd.RDD'>
['{"id": "123", "name": "Katie", "age": 19, "eyeColor": "brown"}', '{"id": "234", "name": "Michael", "age": 22, "eyeColor": "green"}', '{"id": "345", "name": "Simone", "age": 23, "eyeColor": "blue"}']


In [10]:
# Show the DataFrame's Python type and its collected Row objects, one per
# line, followed by the tabular rendering.
print(
    type(swimmersJSON),
    swimmersJSON.collect(),
    sep="\n",
)
swimmersJSON.show()

<class 'pyspark.sql.dataframe.DataFrame'>
[Row(age=19, eyeColor='brown', id='123', name='Katie'), Row(age=22, eyeColor='green', id='234', name='Michael'), Row(age=23, eyeColor='blue', id='345', name='Simone')]
+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [11]:
# Print the schema (column names, types, nullability) of the JSON-derived
# DataFrame.
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [12]:
# Import only the type classes used below instead of a wildcard import, so it
# is clear where each name comes from and the notebook namespace stays clean.
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Source rows as plain Python tuples in (id, name, age, eyeColor) order.
stringCSVRDD = sc.parallelize(
    [
        (123, 'Katie', 19, 'brown'),
        (234, 'Michael', 22, 'green'),
        (345, 'Simone', 23, 'blue'),
    ]
)

# Explicit schema matching the tuple order above; every field is declared
# nullable (third StructField argument is True).
schema = StructType([
    StructField('id', LongType(), True),
    StructField('name', StringType(), True),
    StructField('age', LongType(), True),
    StructField('eyeColor', StringType(), True),
])

# Apply the schema to the RDD to obtain a typed DataFrame, then display it.
swimmers = spark.createDataFrame(stringCSVRDD, schema)
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [13]:
# Return all rows of the DataFrame as a list of Row objects.
swimmers.collect()

[Row(id=123, name='Katie', age=19, eyeColor='brown'),
 Row(id=234, name='Michael', age=22, eyeColor='green'),
 Row(id=345, name='Simone', age=23, eyeColor='blue')]

In [14]:
# Return only the first row, as a one-element list of Row objects.
swimmers.take(1)

[Row(id=123, name='Katie', age=19, eyeColor='brown')]

In [15]:
# Select every column ('*') and print the result as a text table.
swimmers.select('*').show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [16]:
from pyspark.sql.functions import lit

# Display the DataFrame with an extra column 'one' holding the literal 1 in
# every row; `swimmers` itself is not reassigned.
(
    swimmers
    .withColumn('one', lit(1))
    .show()
)

+---+-------+---+--------+---+
| id|   name|age|eyeColor|one|
+---+-------+---+--------+---+
|123|  Katie| 19|   brown|  1|
|234|Michael| 22|   green|  1|
|345| Simone| 23|    blue|  1|
+---+-------+---+--------+---+



In [17]:
# Shut down through the SparkSession rather than the bare SparkContext:
# spark.stop() stops the underlying SparkContext as well, so the session
# object created in the first cell is not left pointing at a stopped context.
spark.stop()