In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Basics").getOrCreate()

In [3]:
spark

In [8]:
df = spark.read.json(r'C:\Users\Kunj.Kinger\Desktop\spark\archive\iris.json')

In [9]:
df.show()

+---------------+-----------+----------+-----------+----------+-------+
|_corrupt_record|petalLength|petalWidth|sepalLength|sepalWidth|species|
+---------------+-----------+----------+-----------+----------+-------+
|              [|       null|      null|       null|      null|   null|
|           null|        1.4|       0.2|        5.1|       3.5| setosa|
|           null|        1.4|       0.2|        4.9|       3.0| setosa|
|           null|        1.3|       0.2|        4.7|       3.2| setosa|
|           null|        1.5|       0.2|        4.6|       3.1| setosa|
|           null|        1.4|       0.2|        5.0|       3.6| setosa|
|           null|        1.7|       0.4|        5.4|       3.9| setosa|
|           null|        1.4|       0.3|        4.6|       3.4| setosa|
|           null|        1.5|       0.2|        5.0|       3.4| setosa|
|           null|        1.4|       0.2|        4.4|       2.9| setosa|
|           null|        1.5|       0.1|        4.9|       3.1| 

In [10]:
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- species: string (nullable = true)



In [11]:
df.columns

['_corrupt_record',
 'petalLength',
 'petalWidth',
 'sepalLength',
 'sepalWidth',
 'species']

In [12]:
df.describe()

DataFrame[summary: string, _corrupt_record: string, petalLength: string, petalWidth: string, sepalLength: string, sepalWidth: string, species: string]

In [13]:
df.describe().show()

+-------+---------------+------------------+------------------+------------------+-------------------+---------+
|summary|_corrupt_record|       petalLength|        petalWidth|       sepalLength|         sepalWidth|  species|
+-------+---------------+------------------+------------------+------------------+-------------------+---------+
|  count|              2|               150|               150|               150|                150|      150|
|   mean|           null|3.7580000000000027| 1.199333333333334| 5.843333333333335|  3.057333333333334|     null|
| stddev|           null|1.7652982332594662|0.7622376689603467|0.8280661279778637|0.43586628493669793|     null|
|    min|              [|               1.0|               0.1|               4.3|                2.0|   setosa|
|    max|              ]|               6.9|               2.5|               7.9|                4.4|virginica|
+-------+---------------+------------------+------------------+------------------+--------------

In [20]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType,FloatType

In [26]:
data_schema = [StructField("sepalWidth",FloatType(),True), StructField("species",StringType(),True),
               StructField("petalWidth",FloatType(),True)]

In [27]:
final_struct = StructType(fields=data_schema)

In [28]:
df = spark.read.json(r"C:\Users\Kunj.Kinger\Desktop\spark\archive\iris.json",schema=final_struct)

In [29]:
df.printSchema()

root
 |-- sepalWidth: float (nullable = true)
 |-- species: string (nullable = true)
 |-- petalWidth: float (nullable = true)



# Grabbing the data

In [32]:
df['species'].show()

TypeError: 'Column' object is not callable

In [39]:
species = df.select('species').distinct()

In [42]:
species.show()

+----------+
|   species|
+----------+
| virginica|
|      null|
|versicolor|
|    setosa|
+----------+



In [45]:
df.withColumn('newWidth',df['sepalWidth'] + 1.0).show(5)

+----------+-------+----------+-----------------+
|sepalWidth|species|petalWidth|         newWidth|
+----------+-------+----------+-----------------+
|      null|   null|      null|             null|
|       3.5| setosa|       0.2|              4.5|
|       3.0| setosa|       0.2|              4.0|
|       3.2| setosa|       0.2|4.200000047683716|
|       3.1| setosa|       0.2|4.099999904632568|
+----------+-------+----------+-----------------+
only showing top 5 rows



In [47]:
df.withColumnRenamed('sepalWidth','Swidth').show(5)

+------+-------+----------+
|Swidth|species|petalWidth|
+------+-------+----------+
|  null|   null|      null|
|   3.5| setosa|       0.2|
|   3.0| setosa|       0.2|
|   3.2| setosa|       0.2|
|   3.1| setosa|       0.2|
+------+-------+----------+
only showing top 5 rows



# Using SQL

In [48]:
#regsiter the DataFrame as a SQL temporary view

In [50]:
df.createOrReplaceTempView("iris")

In [51]:
sql_results = spark.sql("select * from iris")

In [52]:
sql_results

DataFrame[sepalWidth: float, species: string, petalWidth: float]

In [53]:
sql_results.show(2)

+----------+-------+----------+
|sepalWidth|species|petalWidth|
+----------+-------+----------+
|      null|   null|      null|
|       3.5| setosa|       0.2|
+----------+-------+----------+
only showing top 2 rows

