In [None]:
#Introduction
#In this project, we perform Big Data analysis using the Titanic passenger dataset.
#We use PySpark (the Python API for Apache Spark) to handle data at scale and demonstrate fundamental processing techniques such as
#cleaning, aggregation, and insights extraction.
#The analysis covers survival rates by gender and by passenger class, and average age by survival status.

In [1]:
# Install PySpark
!pip install pyspark



In [4]:
!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv -O /content/titanic.csv


--2025-04-27 11:12:42--  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘/content/titanic.csv’


2025-04-27 11:12:42 (3.87 MB/s) - ‘/content/titanic.csv’ saved [60302/60302]



In [5]:
from pyspark.sql import SparkSession

# Step 2: Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("Titanic Big Data Analysis")
    .getOrCreate()
)

# Step 3: Load the Titanic dataset from the local file.
# The first line is treated as the header and column types are inferred from the data.
titanic_csv_path = '/content/titanic.csv'
df = spark.read.csv(titanic_csv_path, header=True, inferSchema=True)

# Step 4: Data exploration (schema, first five rows, total row count)
df.printSchema()
df.show(5)
total_records = df.count()
print(f"Total Records: {total_records}")


root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|   

In [6]:


# Column functions used in this cell. Imported here so the cell runs on a fresh
# kernel (Restart & Run All) instead of relying on names left over from an
# earlier, no-longer-present cell.
from pyspark.sql.functions import avg, col, count, desc


def _survival_summary(passengers, group_col):
    """Return survival rate and passenger count for each value of `group_col`.

    `Survived` holds 0/1 integers, so its average is the share of survivors
    in each group.

    Args:
        passengers: Spark DataFrame with a `Survived` column and `group_col`.
        group_col: Name of the column to group by.

    Returns:
        Spark DataFrame with columns `group_col`, `survival_rate`
        and `num_passengers` (unordered).
    """
    return passengers.groupBy(group_col).agg(
        avg("Survived").alias("survival_rate"),
        count("*").alias("num_passengers"),
    )


# Step 5: Data cleaning - keep only rows that have an 'Age' value and a 'Fare'
# greater than 0 (rows with a null 'Age' or a 'Fare' that is not > 0 are removed).
df_clean = df.dropna(subset=["Age"]).filter(col("Fare") > 0)

# Step 6: Data Analysis: Survival Rate by Gender (highest survival rate first)
survival_by_gender = _survival_summary(df_clean, "Sex").orderBy(desc("survival_rate"))

survival_by_gender.show()

# Step 7: Data Analysis: Survival Rate by Passenger Class (ordered 1st -> 3rd class)
survival_by_class = _survival_summary(df_clean, "Pclass").orderBy("Pclass")

survival_by_class.show()

# Step 8: Data Analysis: Average Age by Survival Status
average_age = df_clean.groupBy("Survived").agg(
    avg("Age").alias("average_age")
)

average_age.show()




+------+------------------+--------------+
|   Sex|     survival_rate|num_passengers|
+------+------------------+--------------+
|female|0.7547892720306514|           261|
|  male|0.2062780269058296|           446|
+------+------------------+--------------+

+------+-------------------+--------------+
|Pclass|      survival_rate|num_passengers|
+------+-------------------+--------------+
|     1| 0.6666666666666666|           183|
|     2| 0.4797687861271676|           173|
|     3|0.23931623931623933|           351|
+------+-------------------+--------------+

+--------+------------------+
|Survived|       average_age|
+--------+------------------+
|       1|28.355259515570935|
|       0| 30.53708133971292|
+--------+------------------+



In [7]:
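# Step 9: Insights (computed on the cleaned data: rows with a known Age and Fare > 0)
# - Insight 1: Females had a much higher survival rate than males (about 75% vs. 21%).
# - Insight 2: First class passengers had the highest survival rate (about 67%), followed by second class (about 48%) and third class (about 24%).
# - Insight 3: Survivors were slightly younger on average than non-survivors (about 28.4 vs. 30.5 years), so younger passengers tended to survive somewhat more often.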