# PySpark example
This notebook demonstrates basic use of Apache Spark with Python.

In [1]:
# Import SparkSession
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("PySpark example")
    .getOrCreate()
)

In [3]:
# Read the Titanic CSV into a DataFrame: the first row supplies the column
# names and the column types are inferred from the data
titanic_data_df = spark.read.csv(
    "data/Titanic_data.csv",
    header=True,
    inferSchema=True,
)

---
Data exploration

In [4]:
# Display the DataFrame's repr, which lists each column name with its inferred type
titanic_data_df

DataFrame[id: int, PClass: string, Age: double, Gender: string, Survived: int, GenderCode: int]

In [5]:
# Print the schema as a tree: column name, data type and nullability
titanic_data_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- PClass: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- GenderCode: integer (nullable = true)



In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev are null for the string columns PClass and Gender.
# NOTE(review): Age has a minimum of 0.0 and a mean of ~17.5 — presumably
# missing ages are stored as 0; confirm against the source data.
titanic_data_df.describe().show()

+-------+-----------------+------+------------------+------+-------------------+-------------------+
|summary|               id|PClass|               Age|Gender|           Survived|         GenderCode|
+-------+-----------------+------+------------------+------+-------------------+-------------------+
|  count|             1313|  1313|              1313|  1313|               1313|               1313|
|   mean|            657.0|  null|17.502574257425742|  null| 0.3427265803503427| 0.3518659558263519|
| stddev|379.1747618183468|  null| 18.51694476332819|  null|0.47480181908910607|0.47773437008438874|
|    min|                1|     *|               0.0|female|                  0|                  0|
|    max|             1313|   3rd|              71.0|  male|                  1|                  1|
+-------+-----------------+------+------------------+------+-------------------+-------------------+



In [7]:
# Count the rows in the DataFrame
titanic_data_df.count()

1313

In [8]:
# Show the first 10 rows
titanic_data_df.show(10)

+---+------+----+------+--------+----------+
| id|PClass| Age|Gender|Survived|GenderCode|
+---+------+----+------+--------+----------+
|  1|   1st|29.0|female|       1|         1|
|  2|   1st| 2.0|female|       0|         1|
|  3|   1st|30.0|  male|       0|         0|
|  4|   1st|25.0|female|       0|         1|
|  5|   1st|0.92|  male|       1|         0|
|  6|   1st|47.0|  male|       1|         0|
|  7|   1st|63.0|female|       1|         1|
|  8|   1st|39.0|  male|       0|         0|
|  9|   1st|58.0|female|       1|         1|
| 10|   1st|71.0|  male|       0|         0|
+---+------+----+------+--------+----------+
only showing top 10 rows



In [9]:
# Count passengers per value of Survived (1 = survivor, 0 = victim)
(
    titanic_data_df
    .groupBy("Survived")
    .count()
    .show()
)

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  450|
|       0|  863|
+--------+-----+



---
Create a temporary view so the data can be queried with SQL

In [10]:
# Register the DataFrame as a temporary view named "data" so that it can be
# referenced by name in spark.sql() queries
titanic_data_df.createOrReplaceTempView("data")

In [11]:
# List the distinct passenger classes present in the data
spark.sql("SELECT DISTINCT PClass FROM data").show()

+------+
|PClass|
+------+
|   2nd|
|     *|
|   1st|
|   3rd|
+------+



In [12]:
# Inspect the rows whose passenger class is the unexpected '*' value
spark.sql("SELECT * FROM data WHERE PClass = '*'").show()

+---+------+---+------+--------+----------+
| id|PClass|Age|Gender|Survived|GenderCode|
+---+------+---+------+--------+----------+
|457|     *|0.0|  male|       0|         0|
+---+------+---+------+--------+----------+



In [13]:
# Passenger count per class, leaving out the '*' class; the result is kept in
# a DataFrame for later use
class_counts = spark.sql(
    "SELECT PClass, COUNT(PClass) AS Passengers "
    "FROM data "
    "WHERE PClass != '*' "
    "GROUP BY PClass"
)
class_counts.show()

+------+----------+
|PClass|Passengers|
+------+----------+
|   2nd|       279|
|   1st|       322|
|   3rd|       711|
+------+----------+



In [14]:
# Survivor count per class (rows with Survived = 1), again leaving out the '*' class
survivors_per_class = spark.sql(
    "SELECT PClass, COUNT(PClass) AS Survivors "
    "FROM data "
    "WHERE PClass != '*' AND Survived = 1 "
    "GROUP BY PClass"
)
survivors_per_class.show()

+------+---------+
|PClass|Survivors|
+------+---------+
|   2nd|      119|
|   1st|      193|
|   3rd|      138|
+------+---------+



In [15]:
# Put total passengers and survivors side by side, matching rows on PClass
# (inner join, which is the default join type)
joined_counts = class_counts.join(
    survivors_per_class,
    on="PClass",
    how="inner",
)
joined_counts.show()

+------+----------+---------+
|PClass|Passengers|Survivors|
+------+----------+---------+
|   2nd|       279|      119|
|   1st|       322|      193|
|   3rd|       711|      138|
+------+----------+---------+



In [16]:
# Write the data from memory to persistent storage in Parquet format.
# mode("overwrite") replaces output left by a previous run; the default save
# mode raises an error when the path already exists, which would make this
# cell fail whenever the notebook is re-run.
joined_counts.write.mode("overwrite").parquet("data/titanic_counts.parquet")

In [17]:
# Read the Parquet data back from storage into a new DataFrame
new_df = spark.read.parquet("data/titanic_counts.parquet")

In [18]:
# Display the DataFrame that was read back from Parquet
new_df.show()

+------+----------+---------+
|PClass|Passengers|Survivors|
+------+----------+---------+
|   2nd|       279|      119|
|   1st|       322|      193|
|   3rd|       711|      138|
+------+----------+---------+

