In [0]:
# Read the Titanic CSV from DBFS into a Spark DataFrame.
# The header row supplies the column names, column types are inferred from
# the data, and the literal "?" is read as a missing value.
titanic_df = (
    spark.read
    .options(header=True, inferSchema=True, nullValue="?")
    .csv("dbfs:/FileStore/halil/titanic.csv")
)

# Render the DataFrame in the Databricks table viewer for a quick look.
display(titanic_df)


2. Inspect schema and basic info

In [0]:
# Column names and their types
titanic_df.printSchema()

# Total number of rows in the DataFrame
row_total = titanic_df.count()
print(f"Number of rows: {row_total}")

# Summary statistics for the columns
titanic_df.describe().show()


3. Clean & prepare data

In [0]:
from pyspark.sql.functions import col

# Columns to keep, each paired with the type it is cast to
# (None means the column is kept exactly as it was read).
kept_columns = {
    "PassengerId": None,
    "Survived": "int",
    "Pclass": "int",
    "Name": None,
    "Sex": None,
    "Age": "double",
    "SibSp": "int",
    "Parch": "int",
    "Fare": "double",
    "Embarked": None,
}
titanic_df = titanic_df.select(
    *[
        col(name) if dtype is None else col(name).cast(dtype)
        for name, dtype in kept_columns.items()
    ]
)

# A row is discarded when any of these critical columns is null.
required_columns = ["Survived", "Pclass", "Sex", "Age", "Fare"]
titanic_df = titanic_df.na.drop(subset=required_columns)

# Preview the first five cleaned rows.
titanic_df.show(5)


4. Survival rate

In [0]:
# Passengers per outcome plus each outcome's share of all passengers.
# "Survived" and "count" are kept unchanged; "rate" holds the proportion, so
# the table reports an actual survival rate rather than raw counts only.
outcome_counts = titanic_df.groupBy("Survived").count()
total_passengers = titanic_df.count()
survival_rate = outcome_counts.withColumn(
    "rate", outcome_counts["count"] / total_passengers
)
display(survival_rate)   # Databricks can plot as bar/pie

Databricks visualization. Run in Databricks to view.

5. Average age by survival

In [0]:
# Mean passenger age within each survival outcome
age_dist = titanic_df.groupBy("Survived").avg("Age")
display(age_dist)


Databricks visualization. Run in Databricks to view.

6. Survival by Sex

In [0]:
# Average of the Survived column for each sex
survival_by_sex = titanic_df.groupBy("Sex").avg("Survived")
display(survival_by_sex)


Databricks visualization. Run in Databricks to view.

7. Survival by passenger class

In [0]:
# Average of the Survived column for each passenger class
survival_by_class = titanic_df.groupBy("Pclass").avg("Survived")
display(survival_by_class)


Databricks visualization. Run in Databricks to view.

8. Convert to pandas to use Matplotlib and Seaborn

In [0]:
# Collect the Spark DataFrame into a pandas DataFrame for plotting
pdf = titanic_df.toPandas()
# Preview the first five rows
pdf.head(5)


9. Basic survival rate

In [0]:
import matplotlib.pyplot as plt

# Two-colour palette; the survival-by-sex plot further down reuses it.
colors = ["lightcoral", "lightblue"]

# Share of passengers per outcome. value_counts() orders by frequency, not by
# label, so the order is pinned to [0, 1]; otherwise the positional tick labels
# and colours below would be swapped whenever survivors outnumber
# non-survivors. fill_value keeps both bars present if one outcome is absent.
survival_rate = (
    pdf["Survived"]
    .value_counts(normalize=True)  # normalize=True -> proportions, not counts
    .reindex([0, 1], fill_value=0.0)
)

plt.figure(figsize=(5,4))
survival_rate.plot(kind="bar", color=colors)
plt.title("Overall Survival Rate")
plt.xticks([0,1], ["Did not survive", "Survived"], rotation=0)
plt.ylabel("Proportion")
plt.show()


10. Survival by sex

In [0]:
# Mean of Survived per sex, drawn as a bar chart
sex_survival = pdf.groupby("Sex")["Survived"].mean()
sex_survival.plot.bar(figsize=(5, 4), color=colors)
plt.title("Survival Rate by Sex")
plt.ylabel("Proportion Survived")
plt.show()


11. Survival by passenger class

In [0]:
# Mean of Survived per passenger class, one bar colour per class
class_colors = ["lightcoral", "lightblue", "lightgreen"]
class_survival = pdf.groupby("Pclass")["Survived"].mean()
class_survival.plot.bar(figsize=(5, 4), color=class_colors)
plt.title("Survival Rate by Class")
plt.ylabel("Proportion Survived")
plt.show()


In [0]:
import seaborn as sns
# Mean of Survived per passenger class, split by sex (error bars disabled).
# Titled and labelled like the other plotting cells; plt.show() keeps the
# Axes repr out of the cell output.
ax = sns.barplot(data=pdf, x="Pclass", y="Survived", hue="Sex", palette="Set2", errorbar=None)
ax.set_title("Survival Rate by Class and Sex")
ax.set_ylabel("Proportion Survived")
plt.show()


In [0]:
# Age histogram with a KDE overlay, one colour per sex.
# Titled and labelled like the other plotting cells; plt.show() keeps the
# Axes repr out of the cell output.
ax = sns.histplot(data=pdf, x="Age", hue="Sex", palette="Set2", kde=True)
ax.set_title("Age Distribution by Sex")
ax.set_ylabel("Number of Passengers")
plt.show()

Correlation Heatmap

In [0]:
# Pairwise correlations between the selected columns, annotated to two decimals
heatmap_columns = ["Survived","Pclass","Age","SibSp","Parch","Fare"]
correlations = pdf[heatmap_columns].corr()

plt.figure(figsize=(8,6))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
