# Jonathan Halverson
# Monday, December 26, 2016
# Creating a schema

In [17]:
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession running on the local master under the
# application name "Schema"; all DataFrame and SQL calls below go through it.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Schema")
    .getOrCreate()
)

In [18]:
# Inspect the class of the session object to confirm what the builder returned.
type(spark)

pyspark.sql.session.SparkSession

In [19]:
# Read the headerless file with schema inference switched off. As the printed
# schema shows, every column comes back as a string with an auto-generated
# name (_c0 ... _c5).
cars = spark.read.csv('cars.dat', header=False, inferSchema=False)
cars.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [20]:
# Read the same file again, this time letting Spark infer the column types.
# The printed schema shows _c2 and _c3 as integer and _c4 as double; the
# remaining columns stay strings and the names are still auto-generated.
cars = spark.read.csv('cars.dat', header=False, inferSchema=True)
cars.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: string (nullable = true)



In [21]:
# Print the rows of the inferred-schema DataFrame as a text table.
cars.show()

+-----+------+-----+---+------+---+
|  _c0|   _c1|  _c2|_c3|   _c4|_c5|
+-----+------+-----+---+------+---+
|Sedan| Camry|31000|  2| 65.91|  D|
|Sedan| Civic|27000|  1| 99.05|  D|
|Truck|  F150|40500|  3|1100.0|  E|
|Truck|Ranger|32000|  3| 950.0|  E|
+-----+------+-----+---+------+---+



In [22]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, DoubleType, IntegerType

# Explicit schema for cars.dat, given as (column name, Spark type) pairs in
# the order the columns appear in the file.
column_specs = [
    ("Vehicle", StringType()),
    ("Model", StringType()),
    ("Cost", IntegerType()),
    ("Type", IntegerType()),
    ("Labor", DoubleType()),
    ("Class", StringType()),
]

# Every column is declared nullable (third StructField argument is True).
fields = [StructField(col_name, col_type, True) for col_name, col_type in column_specs]
schema_ = StructType(fields)

In [23]:
# Read the file a third time, supplying the hand-written schema so the columns
# get the declared names and types instead of _c0 ... _c5.
cars = spark.read.csv('cars.dat', schema=schema_, header=False, inferSchema=False)
cars.printSchema()

root
 |-- Vehicle: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Cost: integer (nullable = true)
 |-- Type: integer (nullable = true)
 |-- Labor: double (nullable = true)
 |-- Class: string (nullable = true)



In [24]:
# Print the rows again, now under the named columns from the explicit schema.
cars.show()

+-------+------+-----+----+------+-----+
|Vehicle| Model| Cost|Type| Labor|Class|
+-------+------+-----+----+------+-----+
|  Sedan| Camry|31000|   2| 65.91|    D|
|  Sedan| Civic|27000|   1| 99.05|    D|
|  Truck|  F150|40500|   3|1100.0|    E|
|  Truck|Ranger|32000|   3| 950.0|    E|
+-------+------+-----+----+------+-----+



Compute the Pearson correlation between the `Cost` and `Labor` columns:

In [25]:
# Correlation coefficient between the Cost and Labor columns (a single float).
cars.corr("Cost", "Labor")

0.7936960369307345

In [26]:
# Contingency table: one row per Vehicle value, one column per Model value,
# each cell holding the number of rows with that Vehicle/Model combination.
cars.crosstab('Vehicle', 'Model').show()

+-------------+-----+-----+----+------+
|Vehicle_Model|Camry|Civic|F150|Ranger|
+-------------+-----+-----+----+------+
|        Sedan|    1|    1|   0|     0|
|        Truck|    0|    0|   1|     1|
+-------------+-----+-----+----+------+



Add two derived columns (`Cost` multiplied by 2, and `Cost` cast to float):

In [27]:
# Append two columns derived from Cost and print the result:
#   newCol  - Cost multiplied by 2
#   twoCost - Cost cast to float
# NOTE(review): despite its name, 'twoCost' holds the float cast, while the
# doubled value lives in 'newCol' - confirm the names are as intended.
(
    cars
    .withColumn('newCol', cars.Cost * 2)
    .withColumn('twoCost', cars.Cost.cast('float'))
    .show()
)

+-------+------+-----+----+------+-----+------+-------+
|Vehicle| Model| Cost|Type| Labor|Class|newCol|twoCost|
+-------+------+-----+----+------+-----+------+-------+
|  Sedan| Camry|31000|   2| 65.91|    D| 62000|31000.0|
|  Sedan| Civic|27000|   1| 99.05|    D| 54000|27000.0|
|  Truck|  F150|40500|   3|1100.0|    E| 81000|40500.0|
|  Truck|Ranger|32000|   3| 950.0|    E| 64000|32000.0|
+-------+------+-----+----+------+-----+------+-------+



In [28]:
# Keep the rows whose Cost is below 35000, then project onto Model and Cost.
(
    cars
    .filter(cars.Cost < 35000)
    .select('Model', 'Cost')
    .show()
)

+------+-----+
| Model| Cost|
+------+-----+
| Camry|31000|
| Civic|27000|
|Ranger|32000|
+------+-----+



In [29]:
# Project onto Model and Class first, then filter on Cost. Cost is not one of
# the selected columns, yet the recorded output shows the filter still applies
# (the same three models as the filter-then-select version).
# NOTE(review): this relies on Spark resolving cars.Cost after the projection;
# filtering before selecting is the less surprising order.
(
    cars
    .select('Model', 'Class')
    .filter(cars.Cost < 35000)
    .show()
)

+------+-----+
| Model|Class|
+------+-----+
| Camry|    D|
| Civic|    D|
|Ranger|    E|
+------+-----+



### Running SQL queries

In [30]:
# Register the DataFrame as a temporary view named "cars" so that it can be
# referenced from SQL text.
cars.createOrReplaceTempView("cars")
# Same filter and projection as the earlier DataFrame-API cell, written as SQL.
spark.sql("""select Model, Cost from cars where Cost < 35000""").show()

+------+-----+
| Model| Cost|
+------+-----+
| Camry|31000|
| Civic|27000|
|Ranger|32000|
+------+-----+

