# Titanlic Data Analysis

#### Creating a SparkSession

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Titanic Data Analysis").getOrCreate()

#### Setting up the dataFile and the schema structure of the data

In [2]:
dataFile = "./data/titanic_data.txt"

schemaStruct = StructType([StructField("rowid",StringType(),True), 
                           StructField("pclass",StringType(),True), 
                           StructField("survived",IntegerType(),True), 
                           StructField("name",StringType(),True), 
                           StructField("age",FloatType(),True), 
                           StructField("embarked",StringType(),True), 
                           StructField("home.dest",StringType(),True),
                           StructField("room",StringType(),True),
                           StructField("ticket",StringType(),True),
                           StructField("boat",StringType(),True), 
                           StructField("sex",StringType(),True)
                           ])

#### Reading the data into a spark DataFrame

In [3]:
df = spark.read.csv(dataFile,schema=schemaStruct)

#### Checking the schema of the DataFrame

In [4]:
df.printSchema()

root
 |-- rowid: string (nullable = true)
 |-- pclass: string (nullable = true)
 |-- survived: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: float (nullable = true)
 |-- embarked: string (nullable = true)
 |-- home.dest: string (nullable = true)
 |-- room: string (nullable = true)
 |-- ticket: string (nullable = true)
 |-- boat: string (nullable = true)
 |-- sex: string (nullable = true)



#### Checking few records

In [5]:
df.show(5,False)

+-----+------+--------+-----------------------------------------------+------+-----------+-------------------------------+----+----------+-----+------+
|rowid|pclass|survived|name                                           |age   |embarked   |home.dest                      |room|ticket    |boat |sex   |
+-----+------+--------+-----------------------------------------------+------+-----------+-------------------------------+----+----------+-----+------+
|1    |1st   |1       |Allen, Miss Elisabeth Walton                   |29.0  |Southampton|St Louis, MO                   |B-5 |24160 L221|2    |female|
|2    |1st   |0       |Allison, Miss Helen Loraine                    |2.0   |Southampton|Montreal, PQ / Chesterville, ON|C26 |null      |null |female|
|3    |1st   |0       |Allison, Mr Hudson Joshua Creighton            |30.0  |Southampton|Montreal, PQ / Chesterville, ON|C26 |null      |(135)|male  |
|4    |1st   |0       |Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)|25.0  |Southampto

#### Changing the data types for rowid and age to Integer

In [6]:
(df
 .withColumn("rowid",col("rowid").cast("int"))
 .withColumn("age",col("age").cast("int"))
)

DataFrame[rowid: int, pclass: string, survived: int, name: string, age: int, embarked: string, home.dest: string, room: string, ticket: string, boat: string, sex: string]

#### Creating a Temporary view on top the DataFrame

In [7]:
df.createOrReplaceTempView('titanic')

#### Q1. Find the average age of people who died and who survived

In [8]:
(df
 .withColumn("Survival",when(col("survived") == 1,"Survived").otherwise("Not Survived"))
 .select("Survival","age")
 .groupBy("Survival")
 .agg(avg("age").cast('int'))
 .show()
)              

+------------+---------------------+
|    Survival|CAST(avg(age) AS INT)|
+------------+---------------------+
|Not Survived|                   32|
|    Survived|                   29|
+------------+---------------------+



#### Using SQL query

In [9]:
spark.sql("""
SELECT CASE WHEN survived = 1 THEN 'Survived'
        ELSE 'Not Survived' END AS survived
     , CAST(avg(age) AS INT) AS avg_age
  FROM titanic
 GROUP BY 1""").show()

+------------+-------+
|    survived|avg_age|
+------------+-------+
|Not Survived|     32|
|    Survived|     29|
+------------+-------+



#### Q2. Number of males and females survived in following age range: age <= 20, 20 < age <= 50 and (50 < age and age =NA)

In [10]:
(df
.withColumn("AgeInd",expr("CASE WHEN age <= 20 THEN 'age<=20'"+
                          " WHEN age > 20 and age <= 50 THEN ' 20 < age <=50'"+
                          " ELSE '50 < age and age= NA' END"))
.groupBy("sex","AgeInd")
.count()
.show()
)

+------+--------------------+-----+
|   sex|              AgeInd|count|
+------+--------------------+-----+
|  male|             age<=20|   78|
|female|             age<=20|   67|
|  male|50 < age and age= NA|  503|
|female|       20 < age <=50|  151|
|female|50 < age and age= NA|  245|
|  male|       20 < age <=50|  269|
+------+--------------------+-----+



#### Using SQL query

In [11]:
spark.sql("""
SELECT sex
     , CASE WHEN age<= 20 THEN 'age<=20'
            WHEN age>20 and age<=50 THEN'20<age<=50'
            ELSE 'age>50 and age=NA' END AS age_category
     , COUNT(*)
  FROM titanic
 GROUP BY 1,2
 ORDER BY 2
""").show()

+------+-----------------+--------+
|   sex|     age_category|count(1)|
+------+-----------------+--------+
|female|       20<age<=50|     151|
|  male|       20<age<=50|     269|
|  male|          age<=20|      78|
|female|          age<=20|      67|
|  male|age>50 and age=NA|     503|
|female|age>50 and age=NA|     245|
+------+-----------------+--------+



#### Q3. Embarked locations and their count

In [12]:
(df
.groupBy("embarked")
.count()
.show(20,truncate=False)
)

+-----------+-----+
|embarked   |count|
+-----------+-----+
|null       |492  |
|Queenstown |45   |
|Southampton|573  |
|Cherbourg  |203  |
+-----------+-----+



#### Using SQL query

In [13]:
spark.sql("""
SELECT embarked
     , count(*) AS count
  FROM titanic
 GROUP BY 1
 ORDER BY 2
""").show()

+-----------+-----+
|   embarked|count|
+-----------+-----+
| Queenstown|   45|
|  Cherbourg|  203|
|       null|  492|
|Southampton|  573|
+-----------+-----+



#### Q4. Number of people survived in each class

In [14]:
(df
.filter("survived = 1")
.groupBy("pclass")
.count()
.show(10,truncate=False)
)

+------+-----+
|pclass|count|
+------+-----+
|2nd   |119  |
|1st   |193  |
|3rd   |137  |
+------+-----+



#### Using SQL query

In [15]:
spark.sql("""
SELECT pclass
     , COUNT(*) AS count
  FROM titanic
 WHERE survived = 1
 GROUP BY 1
 ORDER BY 2
""").show()

+------+-----+
|pclass|count|
+------+-----+
|   2nd|  119|
|   3rd|  137|
|   1st|  193|
+------+-----+



#### Q5. number of males survived whose age is less than 30 and travelling in 2nd class

In [16]:
(df
.filter("age < 30 and pclass = '2nd' and survived = 1 and sex = 'male'")
.count()
)

16

#### Using SQL query

In [17]:
spark.sql("""
SELECT COUNT(*) AS count
  FROM titanic
 WHERE age < 30
   AND pclass = '2nd'
   AND survived = 1
   AND sex = 'male'
   """).show()

+-----+
|count|
+-----+
|   16|
+-----+

