In [3]:
!pip install pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("TitanicAnalysis").getOrCreate()

# Load the CSV data using PySpark, indicating no header row
titanic_data = spark.read.csv("/content/TitanicData.csv", header=False)

# Assign descriptive column names based on your understanding of the data
titanic_data = titanic_data.withColumnRenamed("_c0", "Survived") \
                         .withColumnRenamed("_c1", "Pclass") \
                         .withColumnRenamed("_c2", "Name") \
                         .withColumnRenamed("_c3", "sex") \
                         .withColumnRenamed("_c4", "Age") \
                         .withColumnRenamed("_c5", "SibSp") \
                         .withColumnRenamed("_c6", "Parch") \
                         .withColumnRenamed("_c7", "Ticket") \
                         .withColumnRenamed("_c8", "Fare") \
                         .withColumnRenamed("_c9", "Cabin") \
                         .withColumnRenamed("_c10", "Embarked")


# Extract relevant columns
feature_data = titanic_data.select("Survived", "sex", "Age")

# Split data based on survival
survived_passanger = feature_data.filter(feature_data.Survived == 1)
died_passanger = feature_data.filter(feature_data.Survived == 0)

sur_avg_age = survived_passanger.groupBy("sex").agg(F.avg("Age").alias("avg_age")) \
                               .withColumn("category", F.concat(F.lit("Survived_"), F.col("sex")))
died_avg_age = died_passanger.groupBy("sex").agg(F.avg("Age").alias("avg_age")) \
                             .withColumn("category", F.concat(F.lit("Died_"), F.col("sex")))


# Merge data
merged_data = sur_avg_age.union(died_avg_age)

# Store the report in Colab's temporary directory (adjust as needed)
merged_data.write.csv("/tmp/aaa_report.txt", mode="overwrite")

# Load and display the report
report = spark.read.csv("/tmp/aaa_report.txt", header=True)
report.show()




Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=6c8ca8db5f40ae1d6947c569018d05c52046a5b7e77bb5e2344c203466308412
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
+------+------------------+---------------+
|female| 28.84771573604061|Survived_female|
+------+------------------+---------------+
|  male|27.276021505376345|  Survived_male|
|  male|31.618055555555557|      Died_male|
+------+------------------+---------------+



In [4]:
feature_data = titanic_data.select('Survived', 'Pclass')

# Split data into survived and died groups
died = feature_data.filter(F.col('Survived') == 0)
survived = feature_data.filter(F.col('Survived') == 1)

# Group by Pclass and count survivals and deaths
died_pclass_group = died.groupBy('Pclass').count()
survived_pclass_group = survived.groupBy('Pclass').count()

# Combine and format results
report_df = (
    died_pclass_group.withColumn('category', F.concat(F.lit('Died_Pclass_'), F.col('Pclass')))
      .union(survived_pclass_group.withColumn('category', F.concat(F.lit('Survived_Pclass_'), F.col('Pclass'))))
      .select('category', 'count')
)

# Save as text file
report_df.write.csv('/titanic_analysis/Pclass_analysis.txt')

# Optionally load and display the saved results
report_data = spark.read.csv('/titanic_analysis/Pclass_analysis.txt', header=True, inferSchema=True)
report_data.show()

+-----------------+---+
|Survived_Pclass_3|119|
+-----------------+---+
|Survived_Pclass_1|136|
|Survived_Pclass_2| 87|
|    Died_Pclass_1| 80|
|    Died_Pclass_2| 97|
+-----------------+---+

